From d2499ead93dc4298c0882fe98902acb1b5209f99 Mon Sep 17 00:00:00 2001
From: trav90 <travawine@palemoon.org>
Date: Fri, 19 Oct 2018 23:05:00 -0500
Subject: Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591

---
 third_party/aom/av1/av1.cmake                      |   14 +-
 third_party/aom/av1/av1_cx_iface.c                 |  160 +-
 third_party/aom/av1/av1_dx_iface.c                 |   62 +-
 third_party/aom/av1/av1_iface_common.h             |    7 +-
 third_party/aom/av1/common/alloccommon.h           |    6 +-
 third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 2447 ++++++++-
 third_party/aom/av1/common/arm/av1_inv_txfm_neon.h |    8 +-
 .../aom/av1/common/arm/blend_a64_hmask_neon.c      |    4 +-
 .../aom/av1/common/arm/blend_a64_vmask_neon.c      |    4 +-
 third_party/aom/av1/common/arm/cfl_neon.c          |    4 +-
 third_party/aom/av1/common/arm/convolve_neon.c     |  363 +-
 third_party/aom/av1/common/arm/convolve_neon.h     |    6 +-
 third_party/aom/av1/common/arm/jnt_convolve_neon.c |  512 +-
 third_party/aom/av1/common/arm/mem_neon.h          |   15 +-
 third_party/aom/av1/common/arm/selfguided_neon.c   |   18 +-
 third_party/aom/av1/common/arm/transpose_neon.h    |   83 +-
 third_party/aom/av1/common/arm/warp_plane_neon.c   |  714 +++
 .../aom/av1/common/arm/wiener_convolve_neon.c      |  145 +-
 third_party/aom/av1/common/av1_inv_txfm1d.c        |  140 +-
 third_party/aom/av1/common/av1_inv_txfm1d.h        |    6 +-
 third_party/aom/av1/common/av1_inv_txfm1d_cfg.h    |    6 +-
 third_party/aom/av1/common/av1_loopfilter.c        |  945 +++-
 third_party/aom/av1/common/av1_loopfilter.h        |  120 +-
 third_party/aom/av1/common/av1_rtcd_defs.pl        |   46 +-
 third_party/aom/av1/common/av1_txfm.c              |   50 +
 third_party/aom/av1/common/av1_txfm.h              |   32 +-
 third_party/aom/av1/common/blockd.c                |   64 +-
 third_party/aom/av1/common/blockd.h                |   53 +-
 third_party/aom/av1/common/cdef.h                  |    6 +-
 third_party/aom/av1/common/cdef_block.h            |    6 +-
 third_party/aom/av1/common/cdef_block_simd.h       |    5 +
 third_party/aom/av1/common/cfl.h                   |    6 +-
 third_party/aom/av1/common/common.h                |    6 +-
 third_party/aom/av1/common/common_data.h           |   75 +-
 third_party/aom/av1/common/convolve.c              |  116 +-
 third_party/aom/av1/common/convolve.h              |   21 +-
 third_party/aom/av1/common/entropy.h               |    6 +-
 third_party/aom/av1/common/entropymode.h           |    8 +-
 third_party/aom/av1/common/entropymv.c             |   55 -
 third_party/aom/av1/common/entropymv.h             |   16 +-
 third_party/aom/av1/common/enums.h                 |   12 +-
 third_party/aom/av1/common/filter.h                |   22 +-
 third_party/aom/av1/common/frame_buffers.c         |   11 +
 third_party/aom/av1/common/frame_buffers.h         |   12 +-
 third_party/aom/av1/common/idct.c                  |  274 +-
 third_party/aom/av1/common/idct.h                  |   31 +-
 third_party/aom/av1/common/mv.h                    |    8 +-
 third_party/aom/av1/common/mvref_common.c          |  381 +-
 third_party/aom/av1/common/mvref_common.h          |   43 +-
 third_party/aom/av1/common/obmc.h                  |   14 +-
 third_party/aom/av1/common/obu_util.c              |  147 +
 third_party/aom/av1/common/obu_util.h              |   47 +
 third_party/aom/av1/common/odintrin.h              |   12 +-
 third_party/aom/av1/common/onyxc_int.h             |   29 +-
 third_party/aom/av1/common/ppc/cfl_ppc.c           |   85 +-
 third_party/aom/av1/common/pred_common.c           |    4 +-
 third_party/aom/av1/common/pred_common.h           |    6 +-
 third_party/aom/av1/common/quant_common.h          |    6 +-
 third_party/aom/av1/common/reconinter.c            |  652 +--
 third_party/aom/av1/common/reconinter.h            |  148 +-
 third_party/aom/av1/common/reconintra.h            |    6 +-
 third_party/aom/av1/common/resize.c                |   39 +-
 third_party/aom/av1/common/resize.h                |    6 +-
 third_party/aom/av1/common/restoration.c           |  131 +-
 third_party/aom/av1/common/restoration.h           |    7 +-
 third_party/aom/av1/common/scale.h                 |    7 +-
 third_party/aom/av1/common/scan.h                  |    6 +-
 third_party/aom/av1/common/seg_common.h            |    6 +-
 third_party/aom/av1/common/thread_common.c         |   18 +-
 third_party/aom/av1/common/thread_common.h         |    6 +-
 third_party/aom/av1/common/tile_common.c           |   16 +
 third_party/aom/av1/common/tile_common.h           |    9 +-
 third_party/aom/av1/common/timing.h                |    6 +-
 third_party/aom/av1/common/token_cdfs.h            |    5 +
 third_party/aom/av1/common/txb_common.h            |  243 +-
 third_party/aom/av1/common/warped_motion.c         |    4 +-
 third_party/aom/av1/common/warped_motion.h         |    6 +-
 .../aom/av1/common/x86/av1_convolve_scale_sse4.c   |    1 -
 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c |    6 +
 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h |    6 +-
 .../aom/av1/common/x86/av1_inv_txfm_ssse3.c        |    6 +
 .../aom/av1/common/x86/av1_inv_txfm_ssse3.h        |   10 +-
 third_party/aom/av1/common/x86/av1_txfm_sse2.h     |    6 +-
 third_party/aom/av1/common/x86/av1_txfm_sse4.h     |   11 +-
 third_party/aom/av1/common/x86/cfl_simd.h          |    5 +
 third_party/aom/av1/common/x86/convolve_2d_avx2.c  |    2 -
 third_party/aom/av1/common/x86/convolve_2d_sse2.c  |    3 +-
 third_party/aom/av1/common/x86/convolve_sse2.c     |   11 +-
 .../aom/av1/common/x86/highbd_convolve_2d_avx2.c   |    1 -
 .../aom/av1/common/x86/highbd_convolve_2d_sse4.c   |    1 -
 .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c  |    1 -
 .../aom/av1/common/x86/highbd_inv_txfm_avx2.c      | 1117 +++-
 .../aom/av1/common/x86/highbd_inv_txfm_sse4.c      | 5335 +++++++++++++++-----
 .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c  |    1 -
 .../aom/av1/common/x86/highbd_txfm_utility_sse4.h  |   28 +-
 .../aom/av1/common/x86/highbd_warp_plane_sse4.c    |  268 +-
 third_party/aom/av1/common/x86/jnt_convolve_avx2.c |  211 +-
 third_party/aom/av1/common/x86/reconinter_avx2.c   |  496 ++
 third_party/aom/av1/common/x86/selfguided_avx2.c   |   23 +-
 third_party/aom/av1/common/x86/selfguided_sse4.c   |   24 +-
 third_party/aom/av1/common/x86/warp_plane_sse4.c   |  809 ++-
 .../aom/av1/common/x86/wiener_convolve_avx2.c      |    3 +-
 .../aom/av1/common/x86/wiener_convolve_sse2.c      |    3 +-
 third_party/aom/av1/decoder/accounting.h           |    6 +-
 third_party/aom/av1/decoder/decodeframe.c          |  555 +-
 third_party/aom/av1/decoder/decodeframe.h          |    8 +-
 third_party/aom/av1/decoder/decodemv.c             |  179 +-
 third_party/aom/av1/decoder/decodemv.h             |    6 +-
 third_party/aom/av1/decoder/decoder.c              |   21 +-
 third_party/aom/av1/decoder/decoder.h              |   12 +-
 third_party/aom/av1/decoder/decodetxb.h            |    6 +-
 third_party/aom/av1/decoder/detokenize.h           |    6 +-
 third_party/aom/av1/decoder/dthread.h              |    6 +-
 third_party/aom/av1/decoder/inspection.h           |    6 +-
 third_party/aom/av1/decoder/obu.c                  |  176 +-
 third_party/aom/av1/decoder/obu.h                  |   29 +-
 third_party/aom/av1/encoder/aq_complexity.c        |    7 +-
 third_party/aom/av1/encoder/aq_complexity.h        |    6 +-
 third_party/aom/av1/encoder/aq_cyclicrefresh.c     |    8 +-
 third_party/aom/av1/encoder/aq_cyclicrefresh.h     |    6 +-
 third_party/aom/av1/encoder/aq_variance.c          |  179 +-
 third_party/aom/av1/encoder/aq_variance.h          |   10 +-
 third_party/aom/av1/encoder/av1_fwd_txfm1d.c       |  147 +-
 third_party/aom/av1/encoder/av1_fwd_txfm1d.h       |    6 +-
 third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h   |    6 +-
 third_party/aom/av1/encoder/av1_quantize.c         |   68 +-
 third_party/aom/av1/encoder/av1_quantize.h         |    6 +-
 third_party/aom/av1/encoder/bitstream.c            |  312 +-
 third_party/aom/av1/encoder/bitstream.h            |   16 +-
 third_party/aom/av1/encoder/block.h                |   53 +-
 third_party/aom/av1/encoder/blockiness.c           |    1 -
 third_party/aom/av1/encoder/context_tree.c         |   17 +-
 third_party/aom/av1/encoder/context_tree.h         |    8 +-
 third_party/aom/av1/encoder/corner_detect.h        |    6 +-
 third_party/aom/av1/encoder/corner_match.h         |    6 +-
 third_party/aom/av1/encoder/cost.h                 |    6 +-
 third_party/aom/av1/encoder/dwt.h                  |    5 +
 third_party/aom/av1/encoder/encodeframe.c          | 1075 ++--
 third_party/aom/av1/encoder/encodeframe.h          |   11 +-
 third_party/aom/av1/encoder/encodemb.c             |   71 +-
 third_party/aom/av1/encoder/encodemb.h             |   17 +-
 third_party/aom/av1/encoder/encodemv.c             |   26 +-
 third_party/aom/av1/encoder/encodemv.h             |   14 +-
 third_party/aom/av1/encoder/encoder.c              |  490 +-
 third_party/aom/av1/encoder/encoder.h              |  105 +-
 third_party/aom/av1/encoder/encodetxb.c            |   48 +-
 third_party/aom/av1/encoder/encodetxb.h            |    6 +-
 third_party/aom/av1/encoder/ethread.c              |  252 +-
 third_party/aom/av1/encoder/ethread.h              |    6 +-
 third_party/aom/av1/encoder/extend.h               |    6 +-
 third_party/aom/av1/encoder/firstpass.c            |  992 +---
 third_party/aom/av1/encoder/firstpass.h            |   19 +-
 third_party/aom/av1/encoder/global_motion.c        |    4 +-
 third_party/aom/av1/encoder/global_motion.h        |    6 +-
 third_party/aom/av1/encoder/grain_test_vectors.h   |    6 +-
 third_party/aom/av1/encoder/hash.h                 |    8 +-
 third_party/aom/av1/encoder/hash_motion.c          |   94 +-
 third_party/aom/av1/encoder/hash_motion.h          |   16 +-
 third_party/aom/av1/encoder/hybrid_fwd_txfm.c      |   38 +-
 third_party/aom/av1/encoder/hybrid_fwd_txfm.h      |    6 +-
 third_party/aom/av1/encoder/lookahead.h            |    6 +-
 third_party/aom/av1/encoder/mathutils.h            |    7 +-
 third_party/aom/av1/encoder/mbgraph.c              |    7 +-
 third_party/aom/av1/encoder/mbgraph.h              |    6 +-
 third_party/aom/av1/encoder/mcomp.c                |  231 +-
 third_party/aom/av1/encoder/mcomp.h                |   33 +-
 third_party/aom/av1/encoder/ml.c                   |   16 +
 third_party/aom/av1/encoder/ml.h                   |   11 +-
 third_party/aom/av1/encoder/palette.h              |    6 +-
 .../aom/av1/encoder/partition_model_weights.h      | 1457 ++++--
 third_party/aom/av1/encoder/picklpf.c              |    3 +-
 third_party/aom/av1/encoder/picklpf.h              |    6 +-
 third_party/aom/av1/encoder/pickrst.c              |  128 +-
 third_party/aom/av1/encoder/pickrst.h              |   23 +-
 third_party/aom/av1/encoder/pustats.h              |  183 +-
 third_party/aom/av1/encoder/random.h               |    6 +-
 third_party/aom/av1/encoder/ransac.h               |    6 +-
 .../aom/av1/encoder/rate_distortion_model_params.h |    6 +-
 third_party/aom/av1/encoder/ratectrl.c             |   96 +-
 third_party/aom/av1/encoder/ratectrl.h             |   29 +-
 third_party/aom/av1/encoder/ratectrl_xiph.c        |    0
 third_party/aom/av1/encoder/ratectrl_xiph.h        |    0
 third_party/aom/av1/encoder/rd.c                   |  494 +-
 third_party/aom/av1/encoder/rd.h                   |   48 +-
 third_party/aom/av1/encoder/rdopt.c                | 4751 ++++++++++-------
 third_party/aom/av1/encoder/rdopt.h                |   18 +-
 third_party/aom/av1/encoder/reconinter_enc.c       |  627 +++
 third_party/aom/av1/encoder/reconinter_enc.h       |  127 +
 third_party/aom/av1/encoder/segmentation.h         |    6 +-
 third_party/aom/av1/encoder/speed_features.c       |  145 +-
 third_party/aom/av1/encoder/speed_features.h       |  144 +-
 third_party/aom/av1/encoder/temporal_filter.c      |   81 +-
 third_party/aom/av1/encoder/temporal_filter.h      |    6 +-
 third_party/aom/av1/encoder/tokenize.h             |    6 +-
 .../aom/av1/encoder/tx_prune_model_weights.h       | 1482 +++---
 .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c      |  259 +-
 .../aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c      |   75 +-
 .../aom/av1/encoder/x86/av1_fwd_txfm_avx2.h        |    6 +-
 .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.h        |    6 +-
 third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h  |    9 +-
 third_party/aom/av1/encoder/x86/encodetxb_avx2.c   |  130 +
 third_party/aom/av1/encoder/x86/encodetxb_sse4.c   |   46 +-
 .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c     |  344 +-
 third_party/aom/av1/encoder/x86/pickrst_avx2.c     |  403 ++
 third_party/aom/av1/encoder/x86/pickrst_sse4.c     |  389 ++
 third_party/aom/av1/encoder/x86/wedge_utils_avx2.c |    4 +-
 third_party/aom/av1/exports_com                    |    2 +
 third_party/aom/av1/exports_dec                    |    1 +
 third_party/aom/av1/exports_test                   |    2 +
 209 files changed, 23499 insertions(+), 10802 deletions(-)
 create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.c
 create mode 100644 third_party/aom/av1/common/obu_util.c
 create mode 100644 third_party/aom/av1/common/obu_util.h
 delete mode 100644 third_party/aom/av1/encoder/ratectrl_xiph.c
 delete mode 100644 third_party/aom/av1/encoder/ratectrl_xiph.h
 create mode 100644 third_party/aom/av1/encoder/reconinter_enc.c
 create mode 100644 third_party/aom/av1/encoder/reconinter_enc.h
 create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_avx2.c
 create mode 100644 third_party/aom/av1/encoder/x86/pickrst_avx2.c
 create mode 100644 third_party/aom/av1/encoder/x86/pickrst_sse4.c
 create mode 100644 third_party/aom/av1/exports_com
 create mode 100644 third_party/aom/av1/exports_test

(limited to 'third_party/aom/av1')

diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 4c4f542fe..3a7cd7ee1 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -53,6 +53,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES
             "${AOM_ROOT}/av1/common/mv.h"
             "${AOM_ROOT}/av1/common/mvref_common.c"
             "${AOM_ROOT}/av1/common/mvref_common.h"
+            "${AOM_ROOT}/av1/common/obu_util.c"
+            "${AOM_ROOT}/av1/common/obu_util.h"
             "${AOM_ROOT}/av1/common/odintrin.c"
             "${AOM_ROOT}/av1/common/odintrin.h"
             "${AOM_ROOT}/av1/common/onyxc_int.h"
@@ -78,8 +80,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES
             "${AOM_ROOT}/av1/common/thread_common.h"
             "${AOM_ROOT}/av1/common/tile_common.c"
             "${AOM_ROOT}/av1/common/tile_common.h"
-            "${AOM_ROOT}/av1/common/timing.h"
             "${AOM_ROOT}/av1/common/timing.c"
+            "${AOM_ROOT}/av1/common/timing.h"
             "${AOM_ROOT}/av1/common/token_cdfs.h"
             "${AOM_ROOT}/av1/common/txb_common.c"
             "${AOM_ROOT}/av1/common/txb_common.h"
@@ -176,6 +178,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/rd.h"
             "${AOM_ROOT}/av1/encoder/rdopt.c"
             "${AOM_ROOT}/av1/encoder/rdopt.h"
+            "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
+            "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
             "${AOM_ROOT}/av1/encoder/segmentation.c"
             "${AOM_ROOT}/av1/encoder/segmentation.h"
             "${AOM_ROOT}/av1/encoder/speed_features.c"
@@ -268,7 +272,8 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
             "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
-            "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
+            "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
@@ -276,7 +281,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
             "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")
+            "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
             "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
@@ -301,6 +308,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
             "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
             "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
             "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+            "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
             "${AOM_ROOT}/av1/common/cdef_block_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 3bc4804c9..3295f618a 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -14,28 +14,29 @@
 #include "config/aom_config.h"
 #include "config/aom_version.h"
 
-#include "aom/aom_encoder.h"
 #include "aom_ports/aom_once.h"
+#include "aom_ports/mem_ops.h"
 #include "aom_ports/system_state.h"
+
+#include "aom/aom_encoder.h"
 #include "aom/internal/aom_codec_internal.h"
-#include "av1/encoder/encoder.h"
-#include "aom/aomcx.h"
-#include "av1/encoder/firstpass.h"
+
 #include "av1/av1_iface_common.h"
 #include "av1/encoder/bitstream.h"
-#include "aom_ports/mem_ops.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
 
 #define MAG_SIZE (4)
 #define MAX_NUM_ENHANCEMENT_LAYERS 3
 
 struct av1_extracfg {
   int cpu_used;  // available cpu percentage in 1/16
-  int dev_sf;
   unsigned int enable_auto_alt_ref;
   unsigned int enable_auto_bwd_ref;
   unsigned int noise_sensitivity;
   unsigned int sharpness;
   unsigned int static_thresh;
+  unsigned int row_mt;
   unsigned int tile_columns;  // log2 number of tile columns
   unsigned int tile_rows;     // log2 number of tile rows
   unsigned int arnr_max_frames;
@@ -98,37 +99,40 @@ struct av1_extracfg {
   float noise_level;
   int noise_block_size;
 #endif
+
+  unsigned int chroma_subsampling_x;
+  unsigned int chroma_subsampling_y;
 };
 
 static struct av1_extracfg default_extra_cfg = {
-  0,                 // cpu_used
-  0,                 // dev_sf
-  1,                 // enable_auto_alt_ref
-  0,                 // enable_auto_bwd_ref
-  0,                 // noise_sensitivity
-  0,                 // sharpness
-  0,                 // static_thresh
-  0,                 // tile_columns
-  0,                 // tile_rows
-  7,                 // arnr_max_frames
-  5,                 // arnr_strength
-  0,                 // min_gf_interval; 0 -> default decision
-  0,                 // max_gf_interval; 0 -> default decision
-  AOM_TUNE_PSNR,     // tuning
-  10,                // cq_level
-  0,                 // rc_max_intra_bitrate_pct
-  0,                 // rc_max_inter_bitrate_pct
-  0,                 // gf_cbr_boost_pct
-  0,                 // lossless
-  1,                 // enable_cdef
-  1,                 // enable_restoration
-  0,                 // disable_trellis_quant
-  0,                 // enable_qm
-  DEFAULT_QM_Y,      // qm_y
-  DEFAULT_QM_U,      // qm_u
-  DEFAULT_QM_V,      // qm_v
-  DEFAULT_QM_FIRST,  // qm_min
-  DEFAULT_QM_LAST,   // qm_max
+  0,                       // cpu_used
+  1,                       // enable_auto_alt_ref
+  0,                       // enable_auto_bwd_ref
+  0,                       // noise_sensitivity
+  CONFIG_SHARP_SETTINGS,   // sharpness
+  0,                       // static_thresh
+  0,                       // row_mt
+  0,                       // tile_columns
+  0,                       // tile_rows
+  7,                       // arnr_max_frames
+  5,                       // arnr_strength
+  0,                       // min_gf_interval; 0 -> default decision
+  0,                       // max_gf_interval; 0 -> default decision
+  AOM_TUNE_PSNR,           // tuning
+  10,                      // cq_level
+  0,                       // rc_max_intra_bitrate_pct
+  0,                       // rc_max_inter_bitrate_pct
+  0,                       // gf_cbr_boost_pct
+  0,                       // lossless
+  !CONFIG_SHARP_SETTINGS,  // enable_cdef
+  1,                       // enable_restoration
+  0,                       // disable_trellis_quant
+  0,                       // enable_qm
+  DEFAULT_QM_Y,            // qm_y
+  DEFAULT_QM_U,            // qm_u
+  DEFAULT_QM_V,            // qm_v
+  DEFAULT_QM_FIRST,        // qm_min
+  DEFAULT_QM_LAST,         // qm_max
 #if CONFIG_DIST_8X8
   0,
 #endif
@@ -150,7 +154,7 @@ static struct av1_extracfg default_extra_cfg = {
   0,                            // render width
   0,                            // render height
   AOM_SUPERBLOCK_SIZE_DYNAMIC,  // superblock_size
-  0,                            // Single tile decoding is off by default.
+  1,                            // single_tile_decoding; see large_scale_tile.
   0,                            // error_resilient_mode off by default.
   0,                            // s_frame_mode off by default.
   0,                            // film_grain_test_vector
@@ -168,6 +172,8 @@ static struct av1_extracfg default_extra_cfg = {
   0,   // noise_level
   32,  // noise_block_size
 #endif
+  0,  // chroma_subsampling_x
+  0,  // chroma_subsampling_y
 };
 
 struct aom_codec_alg_priv {
@@ -251,10 +257,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
   RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
   if (extra_cfg->max_gf_interval > 0) {
-    RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
-  }
-  if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
-    RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+    RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
                 (MAX_LAG_BUFFERS - 1));
   }
 
@@ -284,13 +287,14 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
   RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
   RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
-  RANGE_CHECK(extra_cfg, dev_sf, 0, UINT8_MAX);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
               AOM_SUPERBLOCK_SIZE_DYNAMIC);
   RANGE_CHECK_HI(cfg, large_scale_tile, 1);
   RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
 
+  RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+
   RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
   RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
 
@@ -372,6 +376,9 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
 #endif
   }
 
+  RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
+  RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+
   return AOM_CODEC_OK;
 }
 
@@ -581,7 +588,6 @@ static aom_codec_err_t set_encoder_config(
   oxcf->sframe_mode = cfg->sframe_mode;
   oxcf->sframe_enabled = cfg->sframe_dist != 0;
   oxcf->speed = extra_cfg->cpu_used;
-  oxcf->dev_sf = extra_cfg->dev_sf;
   oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
   oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
   oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
@@ -637,6 +643,8 @@ static aom_codec_err_t set_encoder_config(
       oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
   }
 
+  oxcf->row_mt = extra_cfg->row_mt;
+
   oxcf->tile_columns = extra_cfg->tile_columns;
   oxcf->tile_rows = extra_cfg->tile_rows;
 
@@ -692,6 +700,24 @@ static aom_codec_err_t set_encoder_config(
 
   oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
   oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+
+#if CONFIG_REDUCED_ENCODER_BORDER
+  if (oxcf->superres_mode != SUPERRES_NONE ||
+      oxcf->resize_mode != RESIZE_NONE) {
+    warn(
+        "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
+        "Disabling superres/resize.\n");
+    // return AOM_CODEC_INVALID_PARAM;
+    disable_superres(oxcf);
+    oxcf->resize_mode = RESIZE_NONE;
+    oxcf->resize_scale_denominator = SCALE_NUMERATOR;
+    oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
+  }
+#endif  // CONFIG_REDUCED_ENCODER_BORDER
+
+  oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+  oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+
   return AOM_CODEC_OK;
 }
 
@@ -731,6 +757,10 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
   return res;
 }
 
+static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
+  return av1_get_global_headers(ctx->cpi);
+}
+
 static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   int *const arg = va_arg(args, int *);
@@ -765,12 +795,6 @@ static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t ctrl_set_devsf(aom_codec_alg_priv_t *ctx, va_list args) {
-  struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.dev_sf = CAST(AOME_SET_DEVSF, args);
-  return update_extra_cfg(ctx, &extra_cfg);
-}
-
 static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -806,6 +830,13 @@ static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx,
                                              va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1669,6 +1700,20 @@ static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1_COPY_REFERENCE, ctrl_copy_reference },
   { AOME_USE_REFERENCE, ctrl_use_reference },
@@ -1681,11 +1726,11 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
   { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
   { AOME_SET_CPUUSED, ctrl_set_cpuused },
-  { AOME_SET_DEVSF, ctrl_set_devsf },
   { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
   { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
   { AOME_SET_SHARPNESS, ctrl_set_sharpness },
   { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+  { AV1E_SET_ROW_MT, ctrl_set_row_mt },
   { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
   { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
   { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
@@ -1754,7 +1799,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
   { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
   { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
-
+  { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+  { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
   { -1, NULL },
 };
 
@@ -1850,13 +1896,13 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
   },
   {
       // NOLINT
-      1,                      // 1 cfg map
-      encoder_usage_cfg_map,  // aom_codec_enc_cfg_map_t
-      encoder_encode,         // aom_codec_encode_fn_t
-      encoder_get_cxdata,     // aom_codec_get_cx_data_fn_t
-      encoder_set_config,     // aom_codec_enc_config_set_fn_t
-      NULL,                   // aom_codec_get_global_headers_fn_t
-      encoder_get_preview,    // aom_codec_get_preview_frame_fn_t
-      NULL                    // aom_codec_enc_mr_get_mem_loc_fn_t
+      1,                           // 1 cfg map
+      encoder_usage_cfg_map,       // aom_codec_enc_cfg_map_t
+      encoder_encode,              // aom_codec_encode_fn_t
+      encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
+      encoder_set_config,          // aom_codec_enc_config_set_fn_t
+      encoder_get_global_headers,  // aom_codec_get_global_headers_fn_t
+      encoder_get_preview,         // aom_codec_get_preview_frame_fn_t
+      NULL                         // aom_codec_enc_mr_get_mem_loc_fn_t
   }
 };
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
index f42572019..4a6631047 100644
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -26,6 +26,7 @@
 #include "av1/common/alloccommon.h"
 #include "av1/common/frame_buffers.h"
 #include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
 
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/decodeframe.h"
@@ -46,6 +47,7 @@ struct aom_codec_alg_priv {
   int last_show_frame;  // Index of last output frame.
   int byte_alignment;
   int skip_loop_filter;
+  int skip_film_grain;
   int decode_tile_row;
   int decode_tile_col;
   unsigned int tile_mode;
@@ -103,6 +105,15 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
       priv->cfg.cfg.ext_partition = 1;
     }
     av1_zero(priv->image_with_grain);
+    // Turn row_mt on by default.
+    priv->row_mt = 1;
+
+    // Turn on normal tile coding mode by default.
+    // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+    // mode (refer to the lightfield example).
+    priv->tile_mode = 0;
+    priv->decode_tile_row = -1;
+    priv->decode_tile_col = -1;
   }
 
   return AOM_CODEC_OK;
@@ -216,7 +227,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
   while (1) {
     data += bytes_read;
     data_sz -= bytes_read;
-    const uint8_t *payload_start = data;
+    if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
     // Check that the selected OBU is a sequence header
     if (obu_header.type == OBU_SEQUENCE_HEADER) {
       // Sanity check on sequence header size
@@ -264,9 +275,9 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
       }
     }
     // skip past any unread OBU header data
-    data = payload_start + payload_size;
+    data += payload_size;
     data_sz -= payload_size;
-    if (data_sz <= 0) break;  // exit if we're out of OBUs
+    if (data_sz == 0) break;  // exit if we're out of OBUs
     status = aom_read_obu_header_and_size(
         data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
     if (status != AOM_CODEC_OK) return status;
@@ -313,6 +324,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
     cm->new_fb_idx = INVALID_IDX;
     cm->byte_alignment = ctx->byte_alignment;
     cm->skip_loop_filter = ctx->skip_loop_filter;
+    cm->skip_film_grain = ctx->skip_film_grain;
 
     if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
       pool->get_fb_cb = ctx->get_ext_fb_cb;
@@ -434,7 +446,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
     frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
     frame_worker_data->pbi->row_mt = ctx->row_mt;
 
-    worker->hook = (AVxWorkerHook)frame_worker_hook;
+    worker->hook = frame_worker_hook;
     if (!winterface->reset(worker)) {
       set_error_detail(ctx, "Frame Worker thread creation failed");
       return AOM_CODEC_MEM_ERROR;
@@ -515,12 +527,11 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
                                       const uint8_t *data, size_t data_sz,
                                       void *user_priv) {
-  const uint8_t *data_start = data;
-  const uint8_t *data_end = data + data_sz;
   aom_codec_err_t res = AOM_CODEC_OK;
 
-  // Release any pending output frames from the previous decoder call.
-  // We need to do this even if the decoder is being flushed
+  // Release any pending output frames from the previous decoder_decode call.
+  // We need to do this even if the decoder is being flushed or the input
+  // arguments are invalid.
   if (ctx->frame_workers) {
     BufferPool *const pool = ctx->buffer_pool;
     RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -538,10 +549,13 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
     unlock_buffer_pool(ctx->buffer_pool);
   }
 
+  /* Sanity checks */
+  /* NULL data ptr allowed if data_sz is 0 too */
   if (data == NULL && data_sz == 0) {
     ctx->flushed = 1;
     return AOM_CODEC_OK;
   }
+  if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
 
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
@@ -552,6 +566,9 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
     if (res != AOM_CODEC_OK) return res;
   }
 
+  const uint8_t *data_start = data;
+  const uint8_t *data_end = data + data_sz;
+
   if (ctx->is_annexb) {
     // read the size of this temporal unit
     size_t length_of_size;
@@ -617,6 +634,7 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img,
         img->fmt != grain_img_buf->fmt) {
       aom_img_free(grain_img_buf);
       grain_img_buf = NULL;
+      *grain_img_ptr = NULL;
     }
   }
   if (!grain_img_buf) {
@@ -624,7 +642,14 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img,
     *grain_img_ptr = grain_img_buf;
   }
 
-  av1_add_film_grain(grain_params, img, grain_img_buf);
+  if (grain_img_buf) {
+    grain_img_buf->user_priv = img->user_priv;
+    if (av1_add_film_grain(grain_params, img, grain_img_buf)) {
+      aom_img_free(grain_img_buf);
+      grain_img_buf = NULL;
+      *grain_img_ptr = NULL;
+    }
+  }
 
   return grain_img_buf;
 }
@@ -720,8 +745,13 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
           img = &ctx->img;
           img->temporal_id = cm->temporal_layer_id;
           img->spatial_id = cm->spatial_layer_id;
+          if (cm->skip_film_grain) grain_params->apply_grain = 0;
           aom_image_t *res = add_grain_if_needed(
               img, &ctx->image_with_grain[*index], grain_params);
+          if (!res) {
+            aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+                               "Grain systhesis failed\n");
+          }
           *index += 1;  // Advance the iterator to point to the next image
           return res;
         }
@@ -1128,6 +1158,19 @@ static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_OK;
 }
 
+// Control handler: stores the int skip-film-grain flag from args in ctx and,
+// if frame workers already exist, copies it into the worker's decoder state.
+static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int skip = va_arg(args, int);
+  ctx->skip_film_grain = skip;
+  if (ctx->frame_workers != NULL) {
+    FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers->data1;
+    fwd->pbi->common.skip_film_grain = skip;
+  }
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
 #if !CONFIG_ACCOUNTING
@@ -1231,6 +1274,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
   { AV1D_SET_ROW_MT, ctrl_set_row_mt },
   { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
+  { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain },
 
   // Getters
   { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
index c03892b73..4a7af580b 100644
--- a/third_party/aom/av1/av1_iface_common.h
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -8,10 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_AV1_IFACE_COMMON_H_
-#define AV1_AV1_IFACE_COMMON_H_
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
 
 #include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
 
 static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
                             void *user_priv) {
@@ -132,4 +133,4 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
   return AOM_CODEC_OK;
 }
 
-#endif  // AV1_AV1_IFACE_COMMON_H_
+#endif  // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
index dbcb5b947..8e5896981 100644
--- a/third_party/aom/av1/common/alloccommon.h
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ALLOCCOMMON_H_
-#define AV1_COMMON_ALLOCCOMMON_H_
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
 
 #define INVALID_IDX -1  // Invalid buffer index.
 
@@ -45,4 +45,4 @@ int av1_get_MBs(int width, int height);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ALLOCCOMMON_H_
+#endif  // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
index 51c991498..bad411743 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <arm_neon.h>
+
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
@@ -19,19 +21,7 @@
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
 #include "av1/common/arm/av1_inv_txfm_neon.h"
-
-static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
-  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
-  TxSetType tx_set_type;
-  if (tx_size_sqr_up > TX_32X32) {
-    tx_set_type = EXT_TX_SET_DCTONLY;
-  } else if (tx_size_sqr_up == TX_32X32) {
-    tx_set_type = EXT_TX_SET_DCT_IDTX;
-  } else {
-    tx_set_type = EXT_TX_SET_ALL16;
-  }
-  return tx_set_type;
-}
+#include "av1/common/arm/transpose_neon.h"
 
 // 1D itx types
 typedef enum ATTRIBUTE_PACKED {
@@ -65,6 +55,2038 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
   { av1_idct64_new, NULL, NULL },
 };
 
+// Adds `height` residual vectors from in[] to an 8-pixel-wide block at output,
+// storing the sums saturated to 8 bits; flipud consumes in[] bottom-up.
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  const int src_step = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  for (int row = 0; row < height; ++row, src_row += src_step) {
+    const int16x8_t pred = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+    vst1_u8(output, vqmovun_s16(vaddq_s16(pred, in[src_row])));
+    output += stride;
+  }
+}
+
+// Reconstructs 16 pixels: widens pred to 16 bits, adds res0 to the low eight
+// lanes and res1 to the high eight, then narrows each sum back to 8 bits with
+// unsigned saturation.
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  const int16x8_t pred_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  const int16x8_t pred_hi =
+      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+  const uint8x8_t recon_lo = vqmovun_s16(vaddq_s16(pred_lo, res0));
+  const uint8x8_t recon_hi = vqmovun_s16(vaddq_s16(pred_hi, res1));
+  return vcombine_u8(recon_lo, recon_hi);
+}
+
+// Adds residual rows to a 16-pixel-wide block in place.  in[j] feeds the left
+// eight pixels, in[j + height] the right eight; flipud walks j bottom-up.
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  const int src_step = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  for (int row = 0; row < height; ++row, src_row += src_step) {
+    uint8_t *const dst = output + row * stride;
+    vst1q_u8(dst, lowbd_get_recon_16x16_neon(vld1q_u8(dst), in[src_row],
+                                             in[src_row + height]));
+  }
+}
+
+// Fills a[0..size) with `value` broadcast to all eight 16-bit lanes.
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+                                                int value) {
+  const int16x8_t fill = vdupq_n_s16((int16_t)value);
+  for (int idx = 0; idx < size; ++idx) a[idx] = fill;
+}
+
+// Two-input butterfly whose multipliers are lanes 0 and 1 of c.  Products
+// are accumulated in 32 bits:
+//   *t0 <- in0 * c[0] + in1 * c[1]
+//   *t1 <- in0 * c[1] - in1 * c[0]
+// and each result is rounded, shifted right and narrowed to 16 bits.
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(in0), c, 0);
+  int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(in0), c, 0);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(in1), c, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(in1), c, 1);
+
+  int32x4_t diff_lo = vmull_lane_s16(vget_low_s16(in0), c, 1);
+  int32x4_t diff_hi = vmull_lane_s16(vget_high_s16(in0), c, 1);
+  diff_lo = vmlsl_lane_s16(diff_lo, vget_low_s16(in1), c, 0);
+  diff_hi = vmlsl_lane_s16(diff_hi, vget_high_s16(in1), c, 0);
+
+  // Rounding shift by INV_COS_BIT, keeping the low 16 bits of each lane.
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Two-input butterfly whose multipliers are lanes 1 and 0 of c.  Products
+// are accumulated in 32 bits:
+//   *t0 <- in0 * c[1] + in1 * c[0]
+//   *t1 <- in0 * c[0] - in1 * c[1]
+// and each result is rounded, shifted right and narrowed to 16 bits.
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(in0), c, 1);
+  int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(in0), c, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(in1), c, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(in1), c, 0);
+
+  int32x4_t diff_lo = vmull_lane_s16(vget_low_s16(in0), c, 0);
+  int32x4_t diff_hi = vmull_lane_s16(vget_high_s16(in0), c, 0);
+  diff_lo = vmlsl_lane_s16(diff_lo, vget_low_s16(in1), c, 1);
+  diff_hi = vmlsl_lane_s16(diff_hi, vget_high_s16(in1), c, 1);
+
+  // Rounding shift by INV_COS_BIT, keeping the low 16 bits of each lane.
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Two-input butterfly whose multipliers are lanes 2 and 3 of c.  Products
+// are accumulated in 32 bits:
+//   *t0 <- in0 * c[2] + in1 * c[3]
+//   *t1 <- in0 * c[3] - in1 * c[2]
+// and each result is rounded, shifted right and narrowed to 16 bits.
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(in0), c, 2);
+  int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(in0), c, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(in1), c, 3);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(in1), c, 3);
+
+  int32x4_t diff_lo = vmull_lane_s16(vget_low_s16(in0), c, 3);
+  int32x4_t diff_hi = vmull_lane_s16(vget_high_s16(in0), c, 3);
+  diff_lo = vmlsl_lane_s16(diff_lo, vget_low_s16(in1), c, 2);
+  diff_hi = vmlsl_lane_s16(diff_hi, vget_high_s16(in1), c, 2);
+
+  // Rounding shift by INV_COS_BIT, keeping the low 16 bits of each lane.
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Single-input butterfly: scales in0 by two scalar coefficients.
+//   *t0 <- in0 * coef1
+//   *t1 <- in0 * coef2
+// Products are 32-bit; each is rounded, shifted right and narrowed to 16 bits.
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t in_lo = vget_low_s16(in0);
+  const int16x4_t in_hi = vget_high_s16(in0);
+  const int32x4_t p1_lo = vmull_n_s16(in_lo, coef1);
+  const int32x4_t p1_hi = vmull_n_s16(in_hi, coef1);
+  const int32x4_t p2_lo = vmull_n_s16(in_lo, coef2);
+  const int32x4_t p2_hi = vmull_n_s16(in_hi, coef2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(p1_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p1_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(p2_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p2_hi, INV_COS_BIT));
+}
+
+// Two-input butterfly whose multipliers are lanes 3 and 2 of c.  Products
+// are accumulated in 32 bits:
+//   *t0 <- in0 * c[3] + in1 * c[2]
+//   *t1 <- in0 * c[2] - in1 * c[3]
+// and each result is rounded, shifted right and narrowed to 16 bits.
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(in0), c, 3);
+  int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(in0), c, 3);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(in1), c, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(in1), c, 2);
+
+  int32x4_t diff_lo = vmull_lane_s16(vget_low_s16(in0), c, 2);
+  int32x4_t diff_hi = vmull_lane_s16(vget_high_s16(in0), c, 2);
+  diff_lo = vmlsl_lane_s16(diff_lo, vget_low_s16(in1), c, 3);
+  diff_hi = vmlsl_lane_s16(diff_hi, vget_high_s16(in1), c, 3);
+
+  // Rounding shift by INV_COS_BIT, keeping the low 16 bits of each lane.
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// In-place butterfly on x[0] and x[1] with a single multiplier, lane 0 of c.
+// With a = x[0] and b = x[1] as they are on entry:
+//   x[0] <- a * c[0] + b * c[0]
+//   x[1] <- a * c[0] - b * c[0]
+// Each result is rounded, shifted right by INV_COS_BIT and narrowed to 16
+// bits.
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+  // Multiply before adding/subtracting: doing the add/sub first, in 16 bits,
+  // would overflow in iadst8.
+  const int32x4_t a_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+  const int32x4_t a_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+  const int32x4_t b_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+  const int32x4_t b_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+  // Combine in 32 bits, then narrow.
+  const int16x4_t sum_lo = vrshrn_n_s32(vaddq_s32(a_lo, b_lo), INV_COS_BIT);
+  const int16x4_t sum_hi = vrshrn_n_s32(vaddq_s32(a_hi, b_hi), INV_COS_BIT);
+  const int16x4_t diff_lo = vrshrn_n_s32(vsubq_s32(a_lo, b_lo), INV_COS_BIT);
+  const int16x4_t diff_hi = vrshrn_n_s32(vsubq_s32(a_hi, b_hi), INV_COS_BIT);
+
+  x[0] = vcombine_s16(sum_lo, sum_hi);
+  x[1] = vcombine_s16(diff_lo, diff_hi);
+}
+
+// Builds a 4-lane vector by loading one int16_t through each pointer:
+// lane 0 <- *c0, lane 1 <- *c1, lane 2 <- *c2, lane 3 <- *c3.
+static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
+                                          int16_t *const c2,
+                                          int16_t *const c3) {
+  int16x4_t lanes = vld1_lane_s16(c0, vdup_n_s16((int16_t)0), 0);
+  lanes = vld1_lane_s16(c1, lanes, 1);
+  lanes = vld1_lane_s16(c2, lanes, 2);
+  return vld1_lane_s16(c3, lanes, 3);
+}
+
+static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;  // Unused.
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // NOTE(review): int16_t* casts read the first 2 bytes of each int32_t entry.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+                        (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
+                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[8];
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  // Seven stages over eight vectors; x[] and s0..s7 alternate as src/dst.
+  // Stage 1: permute the inputs.
+  x[0] = in[7];
+  x[1] = in[0];
+  x[2] = in[5];
+  x[3] = in[2];
+  x[4] = in[3];
+  x[5] = in[4];
+  x[6] = in[1];
+  x[7] = in[6];
+
+  // Stage 2: lane butterflies on each adjacent pair, constants from c0 and c1.
+  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+  // Stage 3: saturating sums (x[0..3]) and differences (x[4..7]) of s[i], s[i+4].
+  x[0] = vqaddq_s16(s0, s4);
+  x[1] = vqaddq_s16(s1, s5);
+  x[2] = vqaddq_s16(s2, s6);
+  x[3] = vqaddq_s16(s3, s7);
+  x[4] = vqsubq_s16(s0, s4);
+  x[5] = vqsubq_s16(s1, s5);
+  x[6] = vqsubq_s16(s2, s6);
+  x[7] = vqsubq_s16(s3, s7);
+
+  // Stage 4: x[0..3] pass through; x[4..7] go through c2 lane-2/3 butterflies.
+  s0 = x[0];
+  s1 = x[1];
+  s2 = x[2];
+  s3 = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+  // Stage 5: saturating sums and differences of s[i] and s[i + 2] per half.
+  x[0] = vqaddq_s16(s0, s2);
+  x[1] = vqaddq_s16(s1, s3);
+  x[2] = vqsubq_s16(s0, s2);
+  x[3] = vqsubq_s16(s1, s3);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+
+  // stage 6: in-place half butterflies on (x[2], x[3]) and (x[6], x[7]).
+  btf_16_half_neon(x + 2, c2);
+  btf_16_half_neon(x + 6, c2);
+
+  // Stage 7: permute to output order; the odd-indexed outputs are negated.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[4]);
+  out[2] = x[6];
+  out[3] = vnegq_s16(x[2]);
+  out[4] = x[3];
+  out[5] = vnegq_s16(x[7]);
+  out[6] = x[5];
+  out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;  // Unused.
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  // Reduced path: in[0] is the only input vector this function reads.
+  int16x8_t x[8];
+  int16x8_t s0, s1, s4, s5;
+
+  // Stage 1: in[0] takes the x[1] slot.
+  x[1] = in[0];
+
+  // Stage 2: scale by cospi[60] (-> s0) and by -cospi[4] (-> s1); both
+  // coefficients are converted to int16_t at the call.
+  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
+
+  // Stage 3: both halves receive the same (s0, s1) pair.
+  x[0] = s0;
+  x[1] = s1;
+  x[4] = s0;
+  x[5] = s1;
+
+  // Stage 4: x[0..1] pass through; x[4..5] go through a c2 lane-2/3 butterfly.
+  s0 = x[0];
+  s1 = x[1];
+  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+  // Stage 5: each pair is duplicated into the two slots that follow it.
+  x[0] = s0;
+  x[1] = s1;
+  x[2] = s0;
+  x[3] = s1;
+  x[4] = s4;
+  x[5] = s5;
+  x[6] = s4;
+  x[7] = s5;
+
+  // stage 6: in-place half butterflies on (x[2], x[3]) and (x[6], x[7]).
+  btf_16_half_neon(x + 2, c2);
+  btf_16_half_neon(x + 6, c2);
+
+  // Stage 7: permute to output order; the odd-indexed outputs are negated.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[4]);
+  out[2] = x[6];
+  out[3] = vnegq_s16(x[2]);
+  out[4] = x[3];
+  out[5] = vnegq_s16(x[7]);
+  out[6] = x[5];
+  out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+                                  int bit) {
+  (void)bit;  // Unused.
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[8], step2[8];
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  // step1[] and step2[] alternate as source and destination between stages.
+  // stage 2: lane butterflies on the odd-indexed inputs fill step1[4..7].
+  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+  // stage 3: butterflies on the even-indexed inputs; add/sub on step1[4..7].
+  btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+  // stage 4: add/sub on step2[0..3]; butterfly on (step2[6], step2[5]).
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+
+  // stage 5: saturating sums go to out[0..3], matching differences to out[7..4].
+  out[0] = vqaddq_s16(step1[0], step2[7]);
+  out[1] = vqaddq_s16(step1[1], step1[6]);
+  out[2] = vqaddq_s16(step1[2], step1[5]);
+  out[3] = vqaddq_s16(step1[3], step2[4]);
+  out[4] = vqsubq_s16(step1[3], step2[4]);
+  out[5] = vqsubq_s16(step1[2], step1[5]);
+  out[6] = vqsubq_s16(step1[1], step1[6]);
+  out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+// Single-input shortcut: reads in[0] alone, multiplies it by cospi[32] in 32
+// bits, rounds and shifts right by INV_COS_BIT, narrows to 16 bits and writes
+// the same vector to out[0..7].  `bit` is unused.
+static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                       int8_t cos_bit, int bit) {
+  (void)bit;
+  const int16_t scale = (int16_t)cospi_arr(cos_bit)[32];
+
+  // stage 1
+  // stage 2
+  // stage 3
+  const int32x4_t wide_lo = vmull_n_s16(vget_low_s16(in[0]), scale);
+  const int32x4_t wide_hi = vmull_n_s16(vget_high_s16(in[0]), scale);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(wide_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(wide_hi, INV_COS_BIT));
+
+  // stage 4
+  // stage 5
+  out[0] = dc;
+  out[1] = dc;
+  out[2] = dc;
+  out[3] = dc;
+  out[4] = dc;
+  out[5] = dc;
+  out[6] = dc;
+  out[7] = dc;
+}
+
+// Shifts every lane of arr[0..size) right by `bit` with rounding, done as a
+// vrshlq_s16 by -bit.  Returns immediately when bit is 0.
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+  assert(!(size % 4));
+  if (bit == 0) return;
+  const int16x8_t neg_shift = vdupq_n_s16((int16_t)(-bit));
+  for (int k = 0; k < size; ++k) arr[k] = vrshlq_s16(arr[k], neg_shift);
+}
+
+// Reverses the order of input[0..size) in place.  Swapping from both ends
+// needs no fixed-size scratch array, so sizes above 8 cannot overrun a buffer.
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+  for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi) {
+    const int16x8_t tmp = input[lo];
+    input[lo] = input[hi];
+    input[hi] = tmp;
+  }
+}
+
+// Loads 8 rows (out_size apart) of 8 int32_t each, truncating to int16_t.
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+                                                   int16x8_t *const a,
+                                                   int out_size) {
+  for (int row = 0; row < 8; ++row, input += out_size) {
+    const int16x4_t left = vmovn_s32(vld1q_s32(input));
+    a[row] = vcombine_s16(left, vmovn_s32(vld1q_s32(input + 4)));
+  }
+}
+
+// Doubles each of the eight input vectors: output[i] = input[i] * 2 for
+// i = 0..7, using a plain (wrapping, non-saturating) 16-bit multiply.
+//   input / output - eight int16x8_t vectors each.
+//   cos_bit, bit   - unused (same parameter list as the other 1-D
+//                    transforms in this file).
+static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit, int bit) {
+  (void)cos_bit;
+  (void)bit;
+
+  for (int i = 0; i < 8; ++i) {
+    output[i] = vmulq_n_s16(input[i], (int16_t)2);
+  }
+}
+
+// Scales input[0..size) into output[0..size): every lane is multiplied by
+// NewInvSqrt2 in 32 bits, shifted right by NewSqrt2Bits with rounding, and
+// narrowed back to 16 bits with saturation.
+// `size` counts int16x8_t vectors.
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+                                        int size) {
+  const int16_t scale = (int16_t)NewInvSqrt2;
+
+  for (int i = 0; i < size; ++i) {
+    const int32x4_t lo = vmull_n_s16(vget_low_s16(input[i]), scale);
+    const int32x4_t hi = vmull_n_s16(vget_high_s16(input[i]), scale);
+    output[i] = vcombine_s16(vqrshrn_n_s32(lo, (int32_t)NewSqrt2Bits),
+                             vqrshrn_n_s32(hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// Scales each of the 16 input vectors by (2 * NewSqrt2) / 2^NewSqrt2Bits:
+// lanes are multiplied in 32 bits, shifted right by NewSqrt2Bits with
+// rounding, and narrowed back to 16 bits with saturation.
+//   input / output - 16 int16x8_t vectors each.
+//   cos_bit, bit   - unused.
+static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
+                                       int8_t cos_bit, int bit) {
+  (void)cos_bit;
+  (void)bit;
+
+  const int16_t scale = (int16_t)(2 * NewSqrt2);
+
+  for (int i = 0; i < 16; ++i) {
+    const int32x4_t lo = vmull_n_s16(vget_low_s16(input[i]), scale);
+    const int32x4_t hi = vmull_n_s16(vget_high_s16(input[i]), scale);
+    output[i] = vcombine_s16(vqrshrn_n_s32(lo, (int32_t)NewSqrt2Bits),
+                             vqrshrn_n_s32(hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// output[i] = input[i] * 4 for 32 vectors; cos_bit and bit are unused.
+static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
+                                       int8_t cos_bit, int bit) {
+  (void)cos_bit;
+  (void)bit;
+  for (int i = 0; i < 32; ++i) {
+    output[i] = vmulq_n_s16(input[i], (int16_t)4);
+  }
+}
+
+// Single-input shortcut: reads in[0] alone, multiplies it by cospi[32] in 32
+// bits, rounds and shifts right by INV_COS_BIT, narrows to 16 bits and writes
+// the same vector to out[0..15].  `bit` is unused.
+static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int16_t scale = (int16_t)cospi_arr(cos_bit)[32];
+
+  // stage 4
+  const int32x4_t wide_lo = vmull_n_s16(vget_low_s16(in[0]), scale);
+  const int32x4_t wide_hi = vmull_n_s16(vget_high_s16(in[0]), scale);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(wide_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(wide_hi, INV_COS_BIT));
+
+  // stage 6
+  // stage 7
+  out[0] = dc;
+  out[1] = dc;
+  out[2] = dc;
+  out[3] = dc;
+  out[4] = dc;
+  out[5] = dc;
+  out[6] = dc;
+  out[7] = dc;
+  out[8] = dc;
+  out[9] = dc;
+  out[10] = dc;
+  out[11] = dc;
+  out[12] = dc;
+  out[13] = dc;
+  out[14] = dc;
+  out[15] = dc;
+}
+
+static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;  // Unused.
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[16], step2[16];
+  // c0..c3 pack the cosine constants the butterflies select by lane index.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c3 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 2: butterflies on the odd-indexed inputs fill step2[8..15]; the
+  // even-indexed inputs are copied, reordered, into step2[0..7].
+  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+  step2[0] = in[0];
+  step2[1] = in[8];
+  step2[2] = in[4];
+  step2[3] = in[12];
+  step2[4] = in[2];
+  step2[5] = in[10];
+  step2[6] = in[6];
+  step2[7] = in[14];
+
+  // stage 3: butterflies on step2[4..7]; step2[0..3] pass through;
+  // saturating add/sub on neighbouring pairs of step2[8..15].
+  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  step1[8] = vqaddq_s16(step2[8], step2[9]);
+  step1[9] = vqsubq_s16(step2[8], step2[9]);
+  step1[10] = vqsubq_s16(step2[11], step2[10]);
+  step1[11] = vqaddq_s16(step2[11], step2[10]);
+  step1[12] = vqaddq_s16(step2[12], step2[13]);
+  step1[13] = vqsubq_s16(step2[12], step2[13]);
+  step1[14] = vqsubq_s16(step2[15], step2[14]);
+  step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+  // stage 4: butterflies on (0, 1), (2, 3), (14, 9) and negated (10, 13);
+  // add/sub on step1[4..7]; entries 8, 11, 12 and 15 pass through.
+  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
+                       &step2[10], &step2[13]);
+
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5: butterfly on (6, 5); add/sub on 0..3 and 8..15; entries 4 and 7
+  // pass through.
+  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  step1[7] = step2[7];
+  step1[8] = vqaddq_s16(step2[8], step2[11]);
+  step1[9] = vqaddq_s16(step2[9], step2[10]);
+  step1[10] = vqsubq_s16(step2[9], step2[10]);
+  step1[11] = vqsubq_s16(step2[8], step2[11]);
+  step1[12] = vqsubq_s16(step2[15], step2[12]);
+  step1[13] = vqsubq_s16(step2[14], step2[13]);
+  step1[14] = vqaddq_s16(step2[14], step2[13]);
+  step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+  // stage 6: butterflies on (13, 10) and (12, 11); add/sub on 0..7;
+  // entries 8, 9, 14 and 15 pass through.
+  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[7]);
+  step2[1] = vqaddq_s16(step1[1], step1[6]);
+  step2[2] = vqaddq_s16(step1[2], step1[5]);
+  step2[3] = vqaddq_s16(step1[3], step1[4]);
+  step2[4] = vqsubq_s16(step1[3], step1[4]);
+  step2[5] = vqsubq_s16(step1[2], step1[5]);
+  step2[6] = vqsubq_s16(step1[1], step1[6]);
+  step2[7] = vqsubq_s16(step1[0], step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: out[i] = step2[i] + step2[15 - i]; out[15 - i] is the difference.
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;  // 'bit' is not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[16], step2[16];  // successive stages alternate buffers
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // Reads only in[0]..in[7]; writes every entry of out[0]..out[15].
+  // stage 1 / stage 2: even-indexed inputs are copied, odd-indexed ones go
+  // through single-input btf_16_neon calls that fill pairs in step2[8..15].
+  step2[0] = in[0];
+  step2[2] = in[4];
+  step2[4] = in[2];
+  step2[6] = in[6];
+
+  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+  // stage 3: btf on rows 4 and 6; saturating add/sub of pairs in 8..15
+
+  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[8] = vqaddq_s16(step2[8], step2[9]);
+  step1[9] = vqsubq_s16(step2[8], step2[9]);
+  step1[10] = vqsubq_s16(step2[11], step2[10]);
+  step1[11] = vqaddq_s16(step2[11], step2[10]);
+  step1[12] = vqaddq_s16(step2[12], step2[13]);
+  step1[13] = vqsubq_s16(step2[12], step2[13]);
+  step1[14] = vqsubq_s16(step2[15], step2[14]);
+  step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+  // stage 4: btf on 0, 2, 9/14 and negated 10/13; add/sub within 4..7
+
+  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
+                       &step2[10], &step2[13]);
+
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5: btf on 5/6; saturating add/sub within rows 0..3 and 8..15
+
+  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  step1[7] = step2[7];
+  step1[8] = vqaddq_s16(step2[8], step2[11]);
+  step1[9] = vqaddq_s16(step2[9], step2[10]);
+  step1[10] = vqsubq_s16(step2[9], step2[10]);
+  step1[11] = vqsubq_s16(step2[8], step2[11]);
+  step1[12] = vqsubq_s16(step2[15], step2[12]);
+  step1[13] = vqsubq_s16(step2[14], step2[13]);
+  step1[14] = vqaddq_s16(step2[14], step2[13]);
+  step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+  // stage 6: btf on 10/13 and 11/12; add/sub of rows 0..3 with 7..4
+  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[7]);
+  step2[1] = vqaddq_s16(step1[1], step1[6]);
+  step2[2] = vqaddq_s16(step1[2], step1[5]);
+  step2[3] = vqaddq_s16(step1[3], step1[4]);
+  step2[4] = vqsubq_s16(step1[3], step1[4]);
+  step2[5] = vqsubq_s16(step1[2], step1[5]);
+  step2[6] = vqsubq_s16(step1[1], step1[6]);
+  step2[7] = vqsubq_s16(step1[0], step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: out[i] / out[15 - i] = step2[i] +/- step2[15 - i] (saturating)
+
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
+  (void)bit;  // 'bit' is not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // Coefficient vectors: c0..c3 feed stage 2, c4 stage 4, c stages 6 and 8.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+                        (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+                        (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
+                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+  const int16x4_t c3 =
+      create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
+                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+
+  const int16x4_t c =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[16];
+  int16x8_t t[14];  // only t[0]..t[11] are referenced below
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+  // Stage 1: load all 16 inputs: x[2k] = in[15 - 2k], x[2k + 1] = in[2k]
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // Stage 2: one lane-indexed btf per adjacent pair (x[2k], x[2k + 1])
+  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+  // Stage 3: saturating add/sub of s<i> with s<i + 8>
+  x[0] = vqaddq_s16(s0, s8);
+  x[1] = vqaddq_s16(s1, s9);
+  x[2] = vqaddq_s16(s2, s10);
+  x[3] = vqaddq_s16(s3, s11);
+  x[4] = vqaddq_s16(s4, s12);
+  x[5] = vqaddq_s16(s5, s13);
+  x[6] = vqaddq_s16(s6, s14);
+  x[7] = vqaddq_s16(s7, s15);
+  x[8] = vqsubq_s16(s0, s8);
+  x[9] = vqsubq_s16(s1, s9);
+  x[10] = vqsubq_s16(s2, s10);
+  x[11] = vqsubq_s16(s3, s11);
+  x[12] = vqsubq_s16(s4, s12);
+  x[13] = vqsubq_s16(s5, s13);
+  x[14] = vqsubq_s16(s6, s14);
+  x[15] = vqsubq_s16(s7, s15);
+
+  // Stage 4: rows 0..7 are carried in t[]; btf on pairs within x[8..15]
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+  // Stage 5: saturating add/sub at distance 4 within each half
+  x[0] = vqaddq_s16(t[0], t[4]);
+  x[1] = vqaddq_s16(t[1], t[5]);
+  x[2] = vqaddq_s16(t[2], t[6]);
+  x[3] = vqaddq_s16(t[3], t[7]);
+  x[4] = vqsubq_s16(t[0], t[4]);
+  x[5] = vqsubq_s16(t[1], t[5]);
+  x[6] = vqsubq_s16(t[2], t[6]);
+  x[7] = vqsubq_s16(t[3], t[7]);
+  x[8] = vqaddq_s16(s8, s12);
+  x[9] = vqaddq_s16(s9, s13);
+  x[10] = vqaddq_s16(s10, s14);
+  x[11] = vqaddq_s16(s11, s15);
+  x[12] = vqsubq_s16(s8, s12);
+  x[13] = vqsubq_s16(s9, s13);
+  x[14] = vqsubq_s16(s10, s14);
+  x[15] = vqsubq_s16(s11, s15);
+
+  // stage 6: rows 0..3 and 8..11 carried in t[]; btf on 4..7 and 12..15
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+  // Stage 7: saturating add/sub at distance 2 within each group of four
+  x[0] = vqaddq_s16(t[0], t[2]);
+  x[1] = vqaddq_s16(t[1], t[3]);
+  x[2] = vqsubq_s16(t[0], t[2]);
+  x[3] = vqsubq_s16(t[1], t[3]);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+  x[8] = vqaddq_s16(t[8], t[10]);
+  x[9] = vqaddq_s16(t[9], t[11]);
+  x[10] = vqsubq_s16(t[8], t[10]);
+  x[11] = vqsubq_s16(t[9], t[11]);
+  x[12] = vqaddq_s16(s12, s14);
+  x[13] = vqaddq_s16(s13, s15);
+  x[14] = vqsubq_s16(s12, s14);
+  x[15] = vqsubq_s16(s13, s15);
+
+  // Stage 8: helper gets a pointer into x[] (offsets 2, 6, 10, 14), no out args
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: output permutation; every odd-indexed output is negated
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {
+  (void)bit;  // 'bit' is not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[16];  // filled piecemeal; stage 7 assigns all 16 entries
+  int16x8_t t[10];  // only t[0], t[1], t[8] and t[9] are used
+  int16x8_t s0, s1, s4, s5;
+  int16x8_t s8, s9, s12, s13;
+
+  // Stage 1: in[0] is the only input this function reads
+  x[1] = in[0];
+
+  // Stage 2: single-input btf producing s0 and s1
+  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+
+  // Stage 3: plain copies; s0/s1 feed both rows 0/1 and rows 8/9
+  x[0] = s0;
+  x[1] = s1;
+  x[8] = s0;
+  x[9] = s1;
+
+  // Stage 4: rows 0/1 carried in t[]; btf on rows 8/9
+  t[0] = x[0];
+  t[1] = x[1];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+
+  // Stage 5: plain copies into rows 0/1, 4/5, 8/9 and 12/13
+  x[0] = t[0];
+  x[1] = t[1];
+  x[4] = t[0];
+  x[5] = t[1];
+  x[8] = s8;
+  x[9] = s9;
+  x[12] = s8;
+  x[13] = s9;
+
+  // stage 6: rows 0/1 and 8/9 carried in t[]; btf on rows 4/5 and 12/13
+  t[0] = x[0];
+  t[1] = x[1];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  t[8] = x[8];
+  t[9] = x[9];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+
+  // Stage 7: plain copies; each pair is duplicated into the next two rows
+  x[0] = t[0];
+  x[1] = t[1];
+  x[2] = t[0];
+  x[3] = t[1];
+  x[4] = s4;
+  x[5] = s5;
+  x[6] = s4;
+  x[7] = s5;
+  x[8] = t[8];
+  x[9] = t[9];
+  x[10] = t[8];
+  x[11] = t[9];
+  x[12] = s12;
+  x[13] = s13;
+  x[14] = s12;
+  x[15] = s13;
+
+  // Stage 8: helper gets a pointer into x[] (offsets 2, 6, 10, 14), no out args
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: output permutation; every odd-indexed output is negated
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {
+  (void)bit;  // 'bit' is not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // Coefficient vectors: c4 feeds stage 4, c feeds stages 6 and 8.
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[16];
+  int16x8_t t[14];  // only t[0]..t[11] are referenced below
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+  // Stage 1: only in[0]..in[7] are read (into x[1,3,5,7] and x[8,10,12,14])
+  x[1] = in[0];
+  x[3] = in[2];
+  x[5] = in[4];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[10] = in[5];
+  x[12] = in[3];
+  x[14] = in[1];
+
+  // Stage 2: one single-input btf_16_neon per loaded row, giving s0..s15
+  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+  // Stage 3: saturating add/sub of s<i> with s<i + 8>
+  x[0] = vqaddq_s16(s0, s8);
+  x[1] = vqaddq_s16(s1, s9);
+  x[2] = vqaddq_s16(s2, s10);
+  x[3] = vqaddq_s16(s3, s11);
+  x[4] = vqaddq_s16(s4, s12);
+  x[5] = vqaddq_s16(s5, s13);
+  x[6] = vqaddq_s16(s6, s14);
+  x[7] = vqaddq_s16(s7, s15);
+  x[8] = vqsubq_s16(s0, s8);
+  x[9] = vqsubq_s16(s1, s9);
+  x[10] = vqsubq_s16(s2, s10);
+  x[11] = vqsubq_s16(s3, s11);
+  x[12] = vqsubq_s16(s4, s12);
+  x[13] = vqsubq_s16(s5, s13);
+  x[14] = vqsubq_s16(s6, s14);
+  x[15] = vqsubq_s16(s7, s15);
+
+  // Stage 4: rows 0..7 are carried in t[]; btf on pairs within x[8..15]
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+  // Stage 5: saturating add/sub at distance 4 within each half
+  x[0] = vqaddq_s16(t[0], t[4]);
+  x[1] = vqaddq_s16(t[1], t[5]);
+  x[2] = vqaddq_s16(t[2], t[6]);
+  x[3] = vqaddq_s16(t[3], t[7]);
+  x[4] = vqsubq_s16(t[0], t[4]);
+  x[5] = vqsubq_s16(t[1], t[5]);
+  x[6] = vqsubq_s16(t[2], t[6]);
+  x[7] = vqsubq_s16(t[3], t[7]);
+  x[8] = vqaddq_s16(s8, s12);
+  x[9] = vqaddq_s16(s9, s13);
+  x[10] = vqaddq_s16(s10, s14);
+  x[11] = vqaddq_s16(s11, s15);
+  x[12] = vqsubq_s16(s8, s12);
+  x[13] = vqsubq_s16(s9, s13);
+  x[14] = vqsubq_s16(s10, s14);
+  x[15] = vqsubq_s16(s11, s15);
+
+  // stage 6: rows 0..3 and 8..11 carried in t[]; btf on 4..7 and 12..15
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+  // Stage 7: saturating add/sub at distance 2 within each group of four
+  x[0] = vqaddq_s16(t[0], t[2]);
+  x[1] = vqaddq_s16(t[1], t[3]);
+  x[2] = vqsubq_s16(t[0], t[2]);
+  x[3] = vqsubq_s16(t[1], t[3]);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+  x[8] = vqaddq_s16(t[8], t[10]);
+  x[9] = vqaddq_s16(t[9], t[11]);
+  x[10] = vqsubq_s16(t[8], t[10]);
+  x[11] = vqsubq_s16(t[9], t[11]);
+  x[12] = vqaddq_s16(s12, s14);
+  x[13] = vqaddq_s16(s13, s15);
+  x[14] = vqsubq_s16(s12, s14);
+  x[15] = vqsubq_s16(s13, s15);
+
+  // Stage 8: helper gets a pointer into x[] (offsets 2, 6, 10, 14), no out args
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: output permutation; every odd-indexed output is negated
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;  // 'bit' is not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[32], step2[32];  // successive stages alternate buffers
+  // c0..c3 feed stage 2, c4/c5 stage 3, c6 stage 4, c7 stages 5 to 8.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+                        (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+                        (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
+                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+  const int16x4_t c3 =
+      create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
+                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+  const int16x4_t c5 =
+      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+  const int16x4_t c6 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c7 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 2: btf on odd inputs -> rows 16..31; even inputs copied to 0..15
+
+  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+  step2[0] = in[0];
+  step2[1] = in[16];
+  step2[2] = in[8];
+  step2[3] = in[24];
+  step2[4] = in[4];
+  step2[5] = in[20];
+  step2[6] = in[12];
+  step2[7] = in[28];
+  step2[8] = in[2];
+  step2[9] = in[18];
+  step2[10] = in[10];
+  step2[11] = in[26];
+  step2[12] = in[6];
+  step2[13] = in[22];
+  step2[14] = in[14];
+  step2[15] = in[30];
+
+  // stage 3: btf on rows 8..15; rows 0..7 copied; add/sub pairs in 16..31
+
+  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  step1[4] = step2[4];
+  step1[5] = step2[5];
+  step1[6] = step2[6];
+  step1[7] = step2[7];
+
+  step1[16] = vqaddq_s16(step2[16], step2[17]);
+  step1[17] = vqsubq_s16(step2[16], step2[17]);
+  step1[18] = vqsubq_s16(step2[19], step2[18]);
+  step1[19] = vqaddq_s16(step2[19], step2[18]);
+  step1[20] = vqaddq_s16(step2[20], step2[21]);
+  step1[21] = vqsubq_s16(step2[20], step2[21]);
+  step1[22] = vqsubq_s16(step2[23], step2[22]);
+  step1[23] = vqaddq_s16(step2[23], step2[22]);
+  step1[24] = vqaddq_s16(step2[24], step2[25]);
+  step1[25] = vqsubq_s16(step2[24], step2[25]);
+  step1[26] = vqsubq_s16(step2[27], step2[26]);
+  step1[27] = vqaddq_s16(step2[27], step2[26]);
+  step1[28] = vqaddq_s16(step2[28], step2[29]);
+  step1[29] = vqsubq_s16(step2[28], step2[29]);
+  step1[30] = vqsubq_s16(step2[31], step2[30]);
+  step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+  // stage 4: btf on 4..7 and on 17/30, 18/29, 21/26, 22/25; add/sub in 8..15
+
+  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
+                       &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
+                       &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[8] = vqaddq_s16(step1[8], step1[9]);
+  step2[9] = vqsubq_s16(step1[8], step1[9]);
+  step2[10] = vqsubq_s16(step1[11], step1[10]);
+  step2[11] = vqaddq_s16(step1[11], step1[10]);
+  step2[12] = vqaddq_s16(step1[12], step1[13]);
+  step2[13] = vqsubq_s16(step1[12], step1[13]);
+  step2[14] = vqsubq_s16(step1[15], step1[14]);
+  step2[15] = vqaddq_s16(step1[15], step1[14]);
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5: btf on 0..3, 9/14, 10/13 (negated); add/sub in 4..7 and 16..31
+
+  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
+                       &step1[10], &step1[13]);
+
+  step1[4] = vqaddq_s16(step2[4], step2[5]);
+  step1[5] = vqsubq_s16(step2[4], step2[5]);
+  step1[6] = vqsubq_s16(step2[7], step2[6]);
+  step1[7] = vqaddq_s16(step2[7], step2[6]);
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6: btf on 5/6, 18/29, 19/28, 20/27, 21/26; add/sub in 0..3, 8..15
+
+  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
+                       &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
+                       &step2[21], &step2[26]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[3]);
+  step2[1] = vqaddq_s16(step1[1], step1[2]);
+  step2[2] = vqsubq_s16(step1[1], step1[2]);
+  step2[3] = vqsubq_s16(step1[0], step1[3]);
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7: btf on 10/13, 11/12; add/sub of 0..3 with 7..4 and in 16..31
+
+  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8: btf on 20/27, 21/26, 22/25, 23/24; add/sub of 0..7 with 15..8
+
+  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9: out[i] / out[31 - i] = step2[i] +/- step2[31 - i] (saturating)
+
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;  // 'bit' is not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1;
+  int32x4_t t32[2];  // 32-bit products of the low and high halves of in[0]
+
+  // stages 1-5: only in[0] is read. Each 16-bit lane is widened and
+  // multiplied by cospi[32], then narrowed back with a rounding right shift
+  // by INV_COS_BIT.
+  // NOTE(review): the shift uses the constant INV_COS_BIT rather than
+  // cos_bit; presumably both are always equal here - confirm with callers.
+
+  t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                       vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  // stages 6-9: no further arithmetic; the vector computed above is stored
+  // unchanged into every entry out[0]..out[31].
+  // NOTE(review): presumably valid because only in[0] is non-zero - this
+  // function itself does not check the remaining inputs.
+
+  out[0] = step1;
+  out[1] = step1;
+  out[2] = step1;
+  out[3] = step1;
+  out[4] = step1;
+  out[5] = step1;
+  out[6] = step1;
+  out[7] = step1;
+  out[8] = step1;
+  out[9] = step1;
+  out[10] = step1;
+  out[11] = step1;
+  out[12] = step1;
+  out[13] = step1;
+  out[14] = step1;
+  out[15] = step1;
+  out[16] = step1;
+  out[17] = step1;
+  out[18] = step1;
+  out[19] = step1;
+  out[20] = step1;
+  out[21] = step1;
+  out[22] = step1;
+  out[23] = step1;
+  out[24] = step1;
+  out[25] = step1;
+  out[26] = step1;
+  out[27] = step1;
+  out[28] = step1;
+  out[29] = step1;
+  out[30] = step1;
+  out[31] = step1;
+}
+
+static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[32], step2[32];
+  int32x4_t t32[16];
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 1
+  // stage 2
+
+  step2[0] = in[0];
+  step2[4] = in[4];
+  step2[8] = in[2];
+  step2[12] = in[6];
+
+  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] = step2[4];
+
+  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[16];
+  step1[18] = step2[19];
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  step1[21] = step2[20];
+  step1[22] = step2[23];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[24];
+  step1[26] = step2[27];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+  step1[29] = step2[28];
+  step1[30] = step2[31];
+  step1[31] = step2[31];
+
+  // stage 4
+
+  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+                       &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+                       &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[8] = step1[8];
+  step2[9] = step1[8];
+  step2[10] = step1[11];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[13] = step1[12];
+  step2[14] = step1[15];
+  step2[15] = step1[15];
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5
+
+  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                          vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+                       &step1[10], &step1[13]);
+
+  step1[4] = step2[4];
+  step1[5] = step2[4];
+  step1[6] = step2[7];
+  step1[7] = step2[7];
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6
+
+  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+                       &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+                       &step2[21], &step2[26]);
+
+  step2[0] = step1[0];
+  step2[1] = step1[0];
+  step2[2] = step1[0];
+  step2[3] = step1[0];
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7
+
+  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8
+
+  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9
+
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {
+  (void)bit;  // not used by this kernel
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[32], step2[32];
+  int32x4_t t32[16];  // only t32[0] and t32[1] are used (stage 5)
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  // Nine-stage butterfly network: reads in[0]..in[15] only (all in the first
+  // section, before any out[] store) and writes out[0]..out[31].
+  // stage 1 + stage 2: odd in[] feed btf_16_neon, filling step2[16..31];
+  // even in[] are copied into the even slots of step2[0..14].
+  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+  btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+  btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+  btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+  btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+  step2[0] = in[0];
+  step2[2] = in[8];
+  step2[4] = in[4];
+  step2[6] = in[12];
+  step2[8] = in[2];
+  step2[10] = in[10];
+  step2[12] = in[6];
+  step2[14] = in[14];
+
+  // stage 3: btf_16_neon expands step2[8,14,10,12] into step1[8..15];
+  // step1[16..31] are saturating sums/differences of neighbouring step2 pairs.
+  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+  btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+  btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[4] = step2[4];
+  step1[6] = step2[6];
+  step1[16] = vqaddq_s16(step2[16], step2[17]);
+  step1[17] = vqsubq_s16(step2[16], step2[17]);
+  step1[18] = vqsubq_s16(step2[19], step2[18]);
+  step1[19] = vqaddq_s16(step2[19], step2[18]);
+  step1[20] = vqaddq_s16(step2[20], step2[21]);
+  step1[21] = vqsubq_s16(step2[20], step2[21]);
+  step1[22] = vqsubq_s16(step2[23], step2[22]);
+  step1[23] = vqaddq_s16(step2[23], step2[22]);
+  step1[24] = vqaddq_s16(step2[24], step2[25]);
+  step1[25] = vqsubq_s16(step2[24], step2[25]);
+  step1[26] = vqsubq_s16(step2[27], step2[26]);
+  step1[27] = vqaddq_s16(step2[27], step2[26]);
+  step1[28] = vqaddq_s16(step2[28], step2[29]);
+  step1[29] = vqsubq_s16(step2[28], step2[29]);
+  step1[30] = vqsubq_s16(step2[31], step2[30]);
+  step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+  // stage 4: btf_16_neon maps step1[4], step1[6] to step2[4..7]; c0 lane
+  // butterflies handle (17,30), (18,29), (21,26), (22,25); 8..15 add/sub.
+  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+  btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+                       &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+                       &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[2] = step1[2];
+  step2[8] = vqaddq_s16(step1[8], step1[9]);
+  step2[9] = vqsubq_s16(step1[8], step1[9]);
+  step2[10] = vqsubq_s16(step1[11], step1[10]);
+  step2[11] = vqaddq_s16(step1[11], step1[10]);
+  step2[12] = vqaddq_s16(step1[12], step1[13]);
+  step2[13] = vqsubq_s16(step1[12], step1[13]);
+  step2[14] = vqsubq_s16(step1[15], step1[14]);
+  step2[15] = vqaddq_s16(step1[15], step1[14]);
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5: step1[0] = step2[0] * cospi[32], widened to 32 bits and then
+  // narrowed with a rounding right shift by INV_COS_BIT; step1[1] is not formed.
+  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                          vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+                       &step1[10], &step1[13]);
+
+  step1[4] = vqaddq_s16(step2[4], step2[5]);
+  step1[5] = vqsubq_s16(step2[4], step2[5]);
+  step1[6] = vqsubq_s16(step2[7], step2[6]);
+  step1[7] = vqaddq_s16(step2[7], step2[6]);
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6: c1 lane butterflies on (5,6), (18,29), (19,28), (20,27), (21,26);
+  // step1[0] is paired with both step1[3] and step1[2] to form step2[0..3].
+  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+                       &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+                       &step2[21], &step2[26]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[3]);
+  step2[1] = vqaddq_s16(step1[0], step1[2]);
+  step2[2] = vqsubq_s16(step1[0], step1[2]);
+  step2[3] = vqsubq_s16(step1[0], step1[3]);
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7: c1 lane butterflies on (10,13) and (11,12); mirrored add/sub in
+  // step2[0..7], step2[16..23], step2[24..31]; entries 8, 9, 14, 15 are copied.
+  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8: c1 lane butterflies on (20,27)..(23,24); step2[i] and step2[15-i]
+  // are step1[i] +/- step1[15-i] for i = 0..7; 16..19 and 28..31 are copied.
+  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9: out[i] = step2[i] + step2[31-i] and out[31-i] = step2[i] -
+  // step2[31-i] for i = 0..15, using saturating 16-bit arithmetic.
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
 // Functions for blocks with eob at DC and within
 // topleft 8x8, 16x16, 32x32 corner
 static const transform_1d_neon
@@ -90,10 +2112,37 @@ static const transform_1d_neon
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } }
     };
-static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
-                                                  uint8_t *output, int stride,
-                                                  TX_TYPE tx_type,
-                                                  TX_SIZE tx_size, int eob) {
+
+static const transform_neon  // 1-D kernels; NULL = no kernel for that slot
+    lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {  // [0]: first index comes from get_txw_idx()/get_txh_idx()
+          { NULL, NULL, NULL, NULL },  // second: hitx_1d_tab/vitx_1d_tab
+          { NULL, NULL, NULL, NULL },  // third: lowbd_txfm_all_1d_zeros_idx
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },  // [1]
+        { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
+        { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+      {  // [2]
+          { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
+          { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
+            NULL },
+          { identity16_new_neon, identity16_new_neon, identity16_new_neon,
+            NULL },
+      },
+      { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
+          idct32_new_neon },
+        { NULL, NULL, NULL, NULL },  // [3]: no kernel in the iadst position
+        { identity32_new_neon, identity32_new_neon, identity32_new_neon,
+          identity32_new_neon } },
+      { { NULL, NULL, NULL, NULL },  // [4]: nothing registered
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
   int32_t *temp_in = txfm_buf;
 
@@ -160,7 +2209,79 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
   }
 }
 
-static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+                                                  uint8_t *output, int stride,
+                                                  TX_TYPE tx_type,
+                                                  TX_SIZE tx_size, int eob) {
+  int16x8_t a[32 * 4];  // row pass: loaded coefficients, transformed in place
+  int16x8_t b[32 * 4];  // column pass: transposed rows, then handed to output
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // value 0: loads below may fill only part
+  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // bands past eoby are never stored to b
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to run
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-col blocks loaded
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+  // The kernel table holds NULL for unsupported slots; both passes need one.
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  // Row pass, one 8-row band per iteration; results are transposed into b.
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+      transpose_s16_8x8q(&a[k], &a[k]);
+      input_1 += 8;
+    }
+    input += (txfm_size_col * 8);  // skip 8 rows of txfm_size_col coefficients
+    if (abs(rect_type) == 1) {
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    for (int j = 0; j < buf_size_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;
+      transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+    }
+    temp_b += 8;
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // column pass, in place in b
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  if (txfm_size_col >= 16) {  // one helper call per 16-column strip
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      lowbd_add_flip_buffer_16xn_neon(
+          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -244,7 +2365,88 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
   }
 }
 
-static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  int16x8_t a[16 * 2];  // row-pass workspace; not cleared before the loads
+  int16x8_t b[16 * 2];  // column-pass workspace; memset with 0 below
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // bands past eoby are never stored to b
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to run
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-col blocks loaded
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+  // The kernel table holds NULL for unsupported slots; both passes need one.
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  // ud_flip is fetched but not read again: the output helpers are passed 0.
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Row pass per 8-row band; with lr_flip the 8-column block order is reversed.
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+      transpose_s16_8x8q(&a[k], &a[k]);
+      input_1 += 8;
+    }
+    input += (txfm_size_col * 8);  // skip 8 rows of txfm_size_col coefficients
+    if (abs(rect_type) == 1) {
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    if (lr_flip == 1) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        flip_buf_ud_neon(&a[k], 8);  // reorder the 8 vectors before transposing
+        transpose_s16_8x8q(
+            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+      }
+      temp_b += 8;
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+      }
+      temp_b += 8;
+    }
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // column pass, in place in b
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  if (txfm_size_col >= 16) {  // one helper call per 16-column strip
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      lowbd_add_flip_buffer_16xn_neon(
+          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -328,6 +2530,78 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
   }
 }
 
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  int16x8_t a[16 * 2];  // row-pass workspace; memset with 0 below
+  int16x8_t b[16 * 2];  // column-pass workspace; not cleared beforehand
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // value 0: loads below may fill only part
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to run
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-col blocks loaded
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+  // The kernel table holds NULL for unsupported slots; both passes need one.
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  // lr_flip is fetched but never read here; only ud_flip reaches the output.
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Row pass, one 8-row band per iteration; results are transposed into b.
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+      transpose_s16_8x8q(&a[k], &a[k]);
+      input_1 += 8;
+    }
+    input += (txfm_size_col * 8);  // skip 8 rows of txfm_size_col coefficients
+    if (abs(rect_type) == 1) {
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    for (int j = 0; j < buf_size_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;
+      transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+    }
+    temp_b += 8;
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // column pass, in place in b
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  if (txfm_size_col >= 16) {  // one helper call per 16-column strip
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+                                      output + 16 * i, stride, ud_flip,
+                                      txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+  }
+}
+
 static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_TYPE tx_type,
@@ -644,7 +2918,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
   }
 }
 
-static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
@@ -727,6 +3001,118 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
   }
 }
 
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  int16x8_t a[64 * 8];  // row-pass workspace; never cleared in this function
+  int16x8_t b[64 * 8];  // column-pass workspace; never cleared either
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to run
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-col blocks loaded
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;
+
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+  // The kernel table holds NULL for unsupported slots; both passes need one.
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // Row pass per 8-row band; with lr_flip the 8-column block order is reversed.
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+      transpose_s16_8x8q(&a[k], &a[k]);
+      input_1 += 8;
+    }
+    input += (txfm_size_col * 8);  // skip 8 rows of txfm_size_col coefficients
+    if (abs(rect_type) == 1) {
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    if (lr_flip == 1) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        flip_buf_ud_neon(&a[k], 8);  // reorder the 8 vectors before transposing
+        transpose_s16_8x8q(
+            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+      }
+      temp_b += 8;
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+      }
+      temp_b += 8;
+    }
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // column pass, in place in b
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  // Output: 16-column strips, or one 8-wide call; other widths write nothing.
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+                                      output + 16 * i, stride, ud_flip,
+                                      txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  switch (tx_type) {  // forward all arguments unchanged to one *_wxh_* routine
+    case IDTX:
+      lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+    // H_* types are handled by the "v_wxh_identity" routine.
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
+                                               tx_size, eob);
+      break;
+    // V_* types are handled by the "h_wxh_identity" routine.
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
+                                               tx_size, eob);
+      break;
+    // Every other tx_type takes the no-identity routine.
+    default:
+      lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
+                                                tx_size, eob);
+      break;
+  }
+}
+
 static INLINE void lowbd_inv_txfm2d_add_universe_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
@@ -756,6 +3142,7 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon(
       break;
   }
 }
+
 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                    int eob) {
@@ -787,8 +3174,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
       break;
 
     case TX_16X64: {
-      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_64X16: {
@@ -797,13 +3184,13 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
       }
-      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_32X64: {
-      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_64X32: {
@@ -812,8 +3199,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
       }
-      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_64X64: {
@@ -822,8 +3209,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
       }
-      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     default:
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
index 6af2d61e7..9ec658291 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
-#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -23,6 +23,8 @@
 typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
                                   const int8_t cos_bit,
                                   const int8_t *stage_ptr);
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+                               int8_t cos_bit, int bit);
 
 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
@@ -149,4 +151,4 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
   *eoby = eob_fill[temp_eoby];
 }
 
-#endif  // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
index 0d8233744..7134f183e 100644
--- a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -34,8 +34,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
   uint8x8_t tmp0, tmp1;
   uint8x16_t res_q;
   uint16x8_t res, res_low, res_high;
-  uint32x2_t tmp0_32, tmp1_32;
-  uint16x4_t tmp0_16, tmp1_16;
+  uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+  uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
   const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
 
   if (w >= 16) {
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
index 33b06b767..194e94c8c 100644
--- a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -27,8 +27,8 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
   uint8x8_t tmp0, tmp1;
   uint8x16_t tmp0_q, tmp1_q, res_q;
   uint16x8_t res, res_low, res_high;
-  uint32x2_t tmp0_32, tmp1_32;
-  uint16x4_t tmp0_16, tmp1_16;
+  uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+  uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
index d731b6a66..39025b5e5 100644
--- a/third_party/aom/av1/common/arm/cfl_neon.c
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -131,7 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
 }
 
-#if __ARM_ARCH <= 7
+#ifndef __aarch64__
 uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
                       vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
@@ -311,7 +311,7 @@ static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
 
   // Permute and add in such a way that each lane contains the block sum.
   // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
-#if __ARM_ARCH >= 8
+#ifdef __aarch64__
   sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
   sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
 #else
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index f15744c94..d0c4f8ff6 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -13,6 +13,8 @@
 #include <assert.h>
 #include <arm_neon.h>
 
+#include "config/av1_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
@@ -68,6 +70,33 @@ static INLINE uint8x8_t convolve8_horiz_8x8(
   return vqmovun_s16(sum);
 }
 
+#if !defined(__aarch64__)
+static INLINE uint8x8_t convolve8_horiz_4x1(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+  int16x4_t sum;
+  // 8-tap filter over 4 lanes: every lane of sN is scaled by filter[N].
+  sum = vmul_n_s16(s0, filter[0]);
+  sum = vmla_n_s16(sum, s1, filter[1]);
+  sum = vmla_n_s16(sum, s2, filter[2]);
+  sum = vmla_n_s16(sum, s5, filter[5]);
+  sum = vmla_n_s16(sum, s6, filter[6]);
+  sum = vmla_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128, so 128*255 plus the running sum
+   * can exceed 16 bits: the two centre taps are added with saturation.
+   */
+  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+  // Two saturating rounding shifts; a negative per-lane count shifts right.
+  sum = vqrshl_s16(sum, shift_round_0);
+  sum = vqrshl_s16(sum, shift_by_bits);
+  // Saturating narrow to u8; lanes 4-7 of the result repeat lanes 0-3.
+  return vqmovun_s16(vcombine_s16(sum, sum));
+}
+#endif  // !defined(__aarch64__)
+
 static INLINE uint8x8_t convolve8_vert_8x4(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
@@ -175,7 +204,10 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   (void)conv_params;
   (void)filter_params_y;
 
-  uint8x8_t t0, t1, t2, t3;
+  uint8x8_t t0;
+#if defined(__aarch64__)
+  uint8x8_t t1, t2, t3;
+#endif
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -188,7 +220,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
 
   src -= horiz_offset;
-
+#if defined(__aarch64__)
   if (h == 4) {
     uint8x8_t d01, d23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -275,12 +307,18 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       w -= 4;
     } while (w > 0);
   } else {
+#endif
     int width;
     const uint8_t *s;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10;
     uint8x8_t t4, t5, t6, t7;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+#endif
 
     if (w <= 4) {
+#if defined(__aarch64__)
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -387,10 +425,49 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         }
         h -= 8;
       } while (h > 0);
+#else
+    int16x8_t tt0;
+    int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+    const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
+    const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
+    do {
+      t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
+      x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
+
+      t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
+
+      x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
+      x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
+      x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
+      x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
+      x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
+      x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
+
+      src += src_stride;
+
+      t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
+                               shift_round_0_low, shift_by_bits_low);
+
+      if (w == 4) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                      0);  // 00 01 02 03
+        dst += dst_stride;
+      } else if (w == 2) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+        dst += dst_stride;
+      }
+      h -= 1;
+    } while (h > 0);
+#endif
     } else {
       uint8_t *d;
-      int16x8_t s11, s12, s13, s14;
-
+      int16x8_t s11;
+#if defined(__aarch64__)
+      int16x8_t s12, s13, s14;
       do {
         __builtin_prefetch(src + 0 * src_stride);
         __builtin_prefetch(src + 1 * src_stride);
@@ -479,8 +556,47 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         dst += 8 * dst_stride;
         h -= 8;
       } while (h > 0);
+#else
+    do {
+      t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+      width = w;
+      s = src + 8;
+      d = dst;
+      __builtin_prefetch(dst);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s11 = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 shift_round_0, shift_by_bits);
+        vst1_u8(d, t0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 1;
+    } while (h > 0);
+#endif
     }
+#if defined(__aarch64__)
   }
+#endif
 }
 
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -505,9 +621,12 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
 
   if (w <= 4) {
-    uint8x8_t d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+    uint8x8_t d01;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+    uint8x8_t d23;
+    int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
     s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
     src += src_stride;
     s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -526,6 +645,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
     do {
       s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
       src += src_stride;
+#if defined(__aarch64__)
       s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
       src += src_stride;
       s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -591,14 +711,41 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       s5 = s9;
       s6 = s10;
       h -= 4;
+#else
+      __builtin_prefetch(dst + 0 * dst_stride);
+      __builtin_prefetch(src + 0 * src_stride);
+
+      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
+
+      if (w == 4) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+        dst += dst_stride;
+      } else if (w == 2) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
+        dst += dst_stride;
+      }
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      h -= 1;
+#endif
     } while (h > 0);
   } else {
     int height;
     const uint8_t *s;
     uint8_t *d;
-    uint8x8_t t0, t1, t2, t3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
+    uint8x8_t t0;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+#if defined(__aarch64__)
+    uint8x8_t t1, t2, t3;
+    int16x8_t s8, s9, s10;
+#endif
     do {
       __builtin_prefetch(src + 0 * src_stride);
       __builtin_prefetch(src + 1 * src_stride);
@@ -628,6 +775,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       do {
         s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
         s += src_stride;
+#if defined(__aarch64__)
         s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
         s += src_stride;
         s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
@@ -670,6 +818,24 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         s5 = s9;
         s6 = s10;
         height -= 4;
+#else
+        __builtin_prefetch(d);
+        __builtin_prefetch(s);
+
+        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+        vst1_u8(d, t0);
+        d += dst_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+#endif
       } while (height > 0);
       src += 8;
       dst += 8;
@@ -686,7 +852,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                              ConvolveParams *conv_params) {
   int im_dst_stride;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+  uint8x8_t t0;
+#if defined(__aarch64__)
+  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
 
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
@@ -724,13 +893,18 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   assert(conv_params->round_0 > 0);
 
   if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
 
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
 
     do {
       s = src_ptr;
+
+#if defined(__aarch64__)
       __builtin_prefetch(s + 0 * src_stride);
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
@@ -789,16 +963,56 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * im_dst_stride;
       height -= 4;
+#else
+      int16x8_t tt0;
+
+      __builtin_prefetch(s);
+
+      t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s0 = vget_low_s16(tt0);
+      s4 = vget_high_s16(tt0);
+
+      __builtin_prefetch(dst_ptr);
+      s += 8;
+
+      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                             horiz_const, shift_round_0);
+
+      if (w == 4) {
+        vst1_s16(dst_ptr, d0);
+        dst_ptr += im_dst_stride;
+      } else if (w == 2) {
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+        dst_ptr += im_dst_stride;
+      }
+
+      src_ptr += src_stride;
+      height -= 1;
+#endif
     } while (height > 0);
   } else {
     int16_t *d_tmp;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
     int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+#endif
 
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
 
+#if defined(__aarch64__)
     do {
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
@@ -886,6 +1100,45 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       dst_ptr += 8 * im_dst_stride;
       height -= 8;
     } while (height > 0);
+#else
+    do {
+      t0 = vld1_u8(src_ptr);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      width = w;
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t sum = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+
+        vst1q_s16(d_tmp, res0);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += im_dst_stride;
+      height -= 1;
+    } while (height > 0);
+#endif
   }
 
   // vertical
@@ -910,10 +1163,17 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
     width = w;
 
     if (width <= 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x4_t d0, d1, d2, d3;
-      uint16x8_t dd0, dd1;
-      uint8x8_t d01, d23;
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+      uint16x4_t d0;
+      uint16x8_t dd0;
+      uint8x8_t d01;
+
+#if defined(__aarch64__)
+      int16x4_t s8, s9, s10;
+      uint16x4_t d1, d2, d3;
+      uint16x8_t dd1;
+      uint8x8_t d23;
+#endif
 
       d_u8 = dst_u8_ptr;
       v_s = v_src_ptr;
@@ -931,6 +1191,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       v_s += (7 * im_stride);
 
       do {
+#if defined(__aarch64__)
         load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
         v_s += (im_stride << 2);
 
@@ -1008,11 +1269,48 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         s5 = s9;
         s6 = s10;
         height -= 4;
+#else
+        s7 = vld1_s16(v_s);
+        v_s += im_stride;
+
+        __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+
+        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+        d01 = vqmovn_u16(dd0);
+
+        if (w == 4) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+
+        } else if (w == 2) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+        }
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+#endif
       } while (height > 0);
     } else {
       // if width is a multiple of 8 & height is a multiple of 4
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint8x8_t res0, res1, res2, res3;
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+      uint8x8_t res0;
+#if defined(__aarch64__)
+      int16x8_t s8, s9, s10;
+      uint8x8_t res1, res2, res3;
+#endif
 
       do {
         __builtin_prefetch(v_src_ptr + 0 * im_stride);
@@ -1032,6 +1330,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         height = h;
 
         do {
+#if defined(__aarch64__)
           load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
           v_s += (im_stride << 2);
 
@@ -1076,6 +1375,28 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
           s5 = s9;
           s6 = s10;
           height -= 4;
+#else
+          s7 = vld1q_s16(v_s);
+          v_s += im_stride;
+
+          __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+
+          vst1_u8(d_u8, res0);
+          d_u8 += dst_stride;
+
+          s0 = s1;
+          s1 = s2;
+          s2 = s3;
+          s3 = s4;
+          s4 = s5;
+          s5 = s6;
+          s6 = s7;
+          height -= 1;
+#endif
         } while (height > 0);
         v_src_ptr += 8;
         dst_u8_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
index 47c93d645..f382984f2 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.h
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_
-#define AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
 
 #include <arm_neon.h>
 
@@ -225,4 +225,4 @@ static INLINE uint16x4_t convolve8_4x4_s32(
   return res;
 }
 
-#endif  // AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 4015082b4..e5674ef7c 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -22,12 +22,108 @@
 #include "av1/common/arm/mem_neon.h"
 #include "av1/common/arm/transpose_neon.h"
 
+#if !defined(__aarch64__)
+static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
+                                   const uint16_t fwd_offset,
+                                   const uint16_t bck_offset,
+                                   const int16x4_t sub_const_vec,
+                                   const int16_t round_bits,
+                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0;
+  uint16x4_t tmp_u0;
+  uint32x4_t sum0;
+  int32x4_t dst0;
+  int16x8_t tmp4;
+  // Blend res0 with d0, subtract sub_const_vec, round-shift, write u8 to *t0.
+  if (use_jnt_comp_avg) {
+    const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+    // Weighted path: res0 * fwd_offset + d0 * bck_offset in 32-bit lanes.
+    sum0 = vmull_n_u16(res0, fwd_offset);
+    sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+    // Scale the weighted sum back down by DIST_PRECISION_BITS.
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+    // Subtract the constant after sign-extending it to 32-bit lanes.
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+    // Count is -round_bits: a rounding right shift when round_bits > 0.
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    // Saturating narrow to s16, duplicated so the 8-lane narrow can be used.
+    tmp0 = vqmovn_s32(dst0);
+    tmp4 = vcombine_s16(tmp0, tmp0);
+    // Saturating narrow to u8; lanes 4-7 of *t0 repeat lanes 0-3.
+    *t0 = vqmovun_s16(tmp4);
+  } else {
+    const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+    tmp_u0 = vhadd_u16(res0, d0);  // unweighted path: (res0 + d0) >> 1
+    // Subtract the constant directly in 16-bit lanes.
+    tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+    // Count is -round_bits: a rounding right shift when round_bits > 0.
+    tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+    // Duplicate the 4 lanes so the 8-lane narrowing intrinsic can be used.
+    tmp4 = vcombine_s16(tmp0, tmp0);
+    // Saturating narrow to u8; lanes 4-7 of *t0 repeat lanes 0-3.
+    *t0 = vqmovun_s16(tmp4);
+  }
+}
+
+static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
+                                   const uint16_t fwd_offset,
+                                   const uint16_t bck_offset,
+                                   const int16x4_t sub_const,
+                                   const int16_t round_bits,
+                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0, tmp2;
+  int16x8_t f0;
+  uint32x4_t sum0, sum2;
+  int32x4_t dst0, dst2;
+  // Blend res0 with d0, subtract sub_const, round-shift, write 8 u8 to *t0.
+  uint16x8_t tmp_u0;
+
+  if (use_jnt_comp_avg) {
+    const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+    const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+    // Weighted path, low 4 lanes: (res0 * fwd_offset + d0 * bck_offset) >> N.
+    sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+    sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+    // Same computation for the high 4 lanes.
+    sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+    sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+    sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+    // The same 4-lane constant is subtracted from both halves.
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+    dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+    // Count is -round_bits: a rounding right shift when round_bits > 0.
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    dst2 = vqrshlq_s32(dst2, round_bits_vec);
+    // Saturating narrow of each half to s16.
+    tmp0 = vqmovn_s32(dst0);
+    tmp2 = vqmovn_s32(dst2);
+    // Rejoin low and high halves in their original lane order.
+    f0 = vcombine_s16(tmp0, tmp2);
+    // Saturating narrow to u8.
+    *t0 = vqmovun_s16(f0);
+
+  } else {
+    const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+    const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+    // Unweighted path: halving add, (res0 + d0) >> 1 per lane.
+    tmp_u0 = vhaddq_u16(res0, d0);
+    // sub_const is repeated into both halves of the 8-lane constant.
+    f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+    // Count is -round_bits: a rounding right shift when round_bits > 0.
+    f0 = vqrshlq_s16(f0, round_bits_vec);
+    // Saturating narrow to u8.
+    *t0 = vqmovun_s16(f0);
+  }
+}
+#endif  // !defined(__aarch64__)
+
 static INLINE void compute_avg_4x4(
     uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
     uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
     const uint16_t fwd_offset, const uint16_t bck_offset,
     const int16x4_t sub_const_vec, const int16_t round_bits,
-    const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
   int16x4_t tmp0, tmp1, tmp2, tmp3;
   uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
   uint32x4_t sum0, sum1, sum2, sum3;
@@ -107,7 +203,7 @@ static INLINE void compute_avg_8x4(
     uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
     const uint16_t fwd_offset, const uint16_t bck_offset,
     const int16x4_t sub_const, const int16_t round_bits,
-    const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
     uint8x8_t *t3) {
   int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int16x8_t f0, f1, f2, f3;
@@ -231,7 +327,6 @@ static INLINE void jnt_convolve_2d_horiz_neon(
   int16_t *dst_ptr;
   int dst_stride;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
 
   dst_ptr = im_block;
   dst_stride = im_stride;
@@ -239,15 +334,22 @@ static INLINE void jnt_convolve_2d_horiz_neon(
   width = w;
 
   if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    int16x8_t tt0;
+    uint8x8_t t0;
 
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
 
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    int16x8_t tt1, tt2, tt3;
+    uint8x8_t t1, t2, t3;
+#endif
     do {
       s = src;
       __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
       __builtin_prefetch(s + 3 * src_stride);
@@ -301,17 +403,48 @@ static INLINE void jnt_convolve_2d_horiz_neon(
       src += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       height -= 4;
+#else
+      t0 = vld1_u8(s);                            // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+      s0 = vget_low_s16(tt0);                     // a0 a1 a2 a3
+      s4 = vget_high_s16(tt0);                    // a4 a5 a6 a7
+      __builtin_prefetch(dst_ptr);
+      s += 8;
+      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+
+      // a8 a9 a10 a11
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                             horiz_const, shift_round_0);
+
+      vst1_s16(dst_ptr, d0);
+
+      src += src_stride;
+      dst_ptr += dst_stride;
+      height -= 1;
+#endif
     } while (height > 0);
   } else {
     int16_t *d_tmp;
-    int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    int16x8_t res0;
+    uint8x8_t t0;
 
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
-
     do {
+#if defined(__aarch64__)
+      uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+      int16x8_t s8, s9, s10, s11, s12, s13, s14;
+      int16x8_t res1, res2, res3, res4, res5, res6, res7;
       __builtin_prefetch(src + 0 * src_stride);
       __builtin_prefetch(src + 1 * src_stride);
       __builtin_prefetch(src + 2 * src_stride);
@@ -390,6 +523,42 @@ static INLINE void jnt_convolve_2d_horiz_neon(
       src += 8 * src_stride;
       dst_ptr += 8 * dst_stride;
       height -= 8;
+#else
+      int16x8_t temp_0;
+      t0 = vld1_u8(src);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      width = w;
+      s = src + 8;
+      d_tmp = dst_ptr;
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        temp_0 = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(temp_0, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(temp_0, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(temp_0, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(temp_0, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(temp_0, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(temp_0, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(temp_0, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+        vst1q_s16(d_tmp, res0);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src += src_stride;
+      dst_ptr += dst_stride;
+      height -= 1;
+#endif
     } while (height > 0);
   }
 }
@@ -420,10 +589,15 @@ static INLINE void jnt_convolve_2d_vert_neon(
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
 
-  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint16x4_t res4, res5, res6, res7;
-  uint16x4_t d0, d1, d2, d3;
-  uint8x8_t t0, t1;
+  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+  uint16x4_t res4, d0;
+  uint8x8_t t0;
+
+#if defined(__aarch64__)
+  int16x4_t s8, s9, s10;
+  uint16x4_t res5, res6, res7, d1, d2, d3;
+  uint8x8_t t1;
+#endif
 
   dst = conv_params->dst;
   src_ptr = im_block;
@@ -450,6 +624,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
     s += (7 * im_stride);
 
     do {
+#if defined(__aarch64__)
       load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
       s += (im_stride << 2);
 
@@ -480,17 +655,13 @@ static INLINE void jnt_convolve_2d_vert_neon(
                         bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
                         &t0, &t1);
 
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                      0);  // 00 01 02 03
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
         d_u8 += dst8_stride;
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                      1);  // 10 11 12 13
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
         d_u8 += dst8_stride;
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                      0);  // 20 21 22 23
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
         d_u8 += dst8_stride;
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                      1);  // 30 31 32 33
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
         d_u8 += dst8_stride;
 
       } else {
@@ -505,6 +676,39 @@ static INLINE void jnt_convolve_2d_vert_neon(
       s5 = s9;
       s6 = s10;
       height -= 4;
+#else
+      s7 = vld1_s16(s);
+      s += (im_stride);
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d_u8 + 0 * dst8_stride);
+
+      d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                             round_shift_vec, offset_const);
+
+      if (do_average) {
+        res4 = vld1_u16(d);
+        d += (dst_stride);
+
+        compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
+                        round_bits, use_jnt_comp_avg, &t0);
+
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+        d_u8 += dst8_stride;
+
+      } else {
+        vst1_u16(d, d0);
+        d += (dst_stride);
+      }
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      height--;
+#endif
     } while (height > 0);
     src_ptr += 4;
     dst_ptr += 4;
@@ -722,8 +926,10 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   uint8_t *dst_u8_ptr;
   CONV_BUF_TYPE *d, *dst_ptr;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
+  uint8x8_t t0;
+#if defined(__aarch64__)
+  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
   s = src_ptr;
   dst_ptr = dst;
   dst_u8_ptr = dst8;
@@ -731,11 +937,18 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   height = h;
 
   if ((w == 4) || (h == 4)) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint16x4_t res4, res5, res6, res7;
-    uint32x2_t tu0, tu1;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    int16x8_t tt0;
+    uint16x4_t res4;
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    int16x8_t tt1, tt2, tt3;
+    uint16x4_t res5, res6, res7;
+    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
     int16x8_t u0, u1;
+#else
+    int16x4_t temp_0;
+#endif
     const int16x4_t zero = vdup_n_s16(0);
     const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
     const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1);
@@ -746,6 +959,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       d_u8 = dst_u8_ptr;
       width = w;
       __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
       __builtin_prefetch(s + 3 * src_stride);
@@ -854,15 +1068,66 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       dst_ptr += (dst_stride << 2);
       dst_u8_ptr += (dst8_stride << 2);
       height -= 4;
+#else
+      t0 = vld1_u8(s);                            // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+      s0 = vget_low_s16(tt0);                     // a0 a1 a2 a3
+      s4 = vget_high_s16(tt0);                    // a4 a5 a6 a7
+      __builtin_prefetch(d);
+
+      s += 8;
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11
+
+        // a8 a9 a10 a11
+        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        temp_0 = s7;
+        s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+        s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+        s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+        s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+        s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+        s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                               zero, shift_round_0);
+        d0 = vrshl_s16(d0, horiz_const);
+        d0 = vadd_s16(d0, round_offset_vec);
+        s0 = s4;
+        s4 = temp_0;
+        if (conv_params->do_average) {
+          __builtin_prefetch(d);
+          __builtin_prefetch(d_u8);
+
+          res4 = vld1_u16(d);
+
+          compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+                          bck_offset, round_offset_vec, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+        } else {
+          vst1_u16(d, vreinterpret_u16_s16(d0));
+        }
+
+        s += 4;
+        width -= 4;
+        d += 4;
+        d_u8 += 4;
+      } while (width > 0);
+      src_ptr += (src_stride);
+      dst_ptr += (dst_stride);
+      dst_u8_ptr += (dst8_stride);
+      height--;
+#endif
     } while (height > 0);
   } else {
     CONV_BUF_TYPE *d_tmp;
     uint8_t *d_u8_tmp;
-    int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t res8, res9, res10, res11;
-
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    int16x8_t res0;
+    uint16x8_t res8;
     const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
     const int16x4_t round_offset64 = vdup_n_s16(round_offset);
     const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
@@ -872,6 +1137,11 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
     d = dst_ptr = dst;
     d_u8 = dst_u8_ptr = dst8;
     do {
+#if defined(__aarch64__)
+      int16x8_t s11, s12, s13, s14;
+      int16x8_t s8, s9, s10;
+      int16x8_t res1, res2, res3, res4, res5, res6, res7;
+      uint16x8_t res9, res10, res11;
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
       __builtin_prefetch(src_ptr + 2 * src_stride);
@@ -1007,6 +1277,67 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       dst_ptr += 8 * dst_stride;
       dst_u8_ptr += 8 * dst8_stride;
       height -= 8;
+#else
+      int16x8_t temp_0;
+      __builtin_prefetch(src_ptr);
+      t0 = vld1_u8(src_ptr);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      width = w;
+      s = src_ptr + 8;
+      d = dst_ptr;
+      d_u8_tmp = dst_u8_ptr;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        d_u8 = d_u8_tmp;
+        d_tmp = d;
+
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        temp_0 = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(temp_0, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(temp_0, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(temp_0, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(temp_0, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(temp_0, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(temp_0, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(temp_0, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+                                 x_filter_tmp, zero, shift_round_0);
+
+        res0 = vrshlq_s16(res0, horiz_const);
+        res0 = vaddq_s16(res0, round_offset128);
+
+        if (conv_params->do_average) {
+          res8 = vld1q_u16(d_tmp);
+          d_tmp += (dst_stride);
+
+          compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+                          bck_offset, round_offset64, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_u8(d_u8, t0);
+          d_u8 += (dst8_stride);
+        } else {
+          vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+          d_tmp += (dst_stride);
+        }
+
+        s += 8;
+        d += 8;
+        width -= 8;
+        d_u8_tmp += 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      dst_u8_ptr += dst8_stride;
+      height--;
+#endif
     } while (height > 0);
   }
 }
@@ -1057,7 +1388,6 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   uint8_t *dst_u8_ptr;
   CONV_BUF_TYPE *d, *dst_ptr;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
 
   s = src_ptr;
   dst_ptr = dst;
@@ -1070,11 +1400,18 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   assert((conv_params->round_1 - 2) >= bits);
 
   if ((w == 4) || (h == 4)) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    uint16x4_t res4, res5, res6, res7;
-    uint32x2_t tu0, tu1, tu2, tu3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    uint16x4_t res4;
+    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+               tu3 = vdup_n_u32(0);
     int16x8_t u0, u1, u2, u3;
+    uint8x8_t t0;
 
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    uint16x4_t res5, res6, res7;
+    uint8x8_t t1;
+#endif
     const int16x4_t round_offset64 = vdup_n_s16(round_offset);
     const int16x4_t shift_vec = vdup_n_s16(-shift_value);
     const int16x4_t zero = vdup_n_s16(0);
@@ -1111,6 +1448,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
       s += (7 * src_stride);
       do {
+#if defined(__aarch64__)
         load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
 
         u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
@@ -1154,17 +1492,13 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                           round_offset64, round_bits, use_jnt_comp_avg, &t0,
                           &t1);
 
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                        0);  // 00 01 02 03
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
           d_u8 += dst8_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                        1);  // 10 11 12 13
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
           d_u8 += dst8_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                        0);  // 20 21 22 23
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
           d_u8 += dst8_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                        1);  // 30 31 32 33
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
           d_u8 += dst8_stride;
         } else {
           store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
@@ -1183,6 +1517,44 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
         s += (src_stride << 2);
         height -= 4;
+#else
+        load_unaligned_u8_4x1(s, src_stride, &tu0);
+        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        s7 = vget_low_s16(u0);
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                               zero, shift_vec);
+
+        d0 = vadd_s16(d0, round_offset64);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d);
+
+          res4 = vld1_u16(d);
+          d += (dst_stride);
+
+          compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+                          bck_offset, round_offset64, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+          d_u8 += dst8_stride;
+        } else {
+          vst1_u16(d, vreinterpret_u16_s16(d0));
+          d += (dst_stride);
+        }
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        s += (src_stride);
+        height--;
+#endif
       } while (height > 0);
       src_ptr += 4;
       dst_ptr += 4;
@@ -1191,15 +1563,19 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
     } while (width > 0);
   } else {
     CONV_BUF_TYPE *d_tmp;
-    int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t res8, res9, res10, res11;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    int16x8_t res0;
+    uint16x8_t res8;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
     const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
     const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
     const int16x4_t round_offset64 = vdup_n_s16(round_offset);
     const int16x8_t zero = vdupq_n_s16(0);
-
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10, s11, s12, s13, s14;
+    int16x8_t res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t res10, res11, res9;
+#endif
     dst_ptr = dst;
     dst_u8_ptr = dst8;
     do {
@@ -1227,6 +1603,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       d_u8 = dst_u8_ptr;
 
       do {
+#if defined(__aarch64__)
         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1316,6 +1693,43 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
         s6 = s14;
         s += (8 * src_stride);
         height -= 8;
+#else
+        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        __builtin_prefetch(dst_ptr);
+
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                                 zero, shift_vec);
+        res0 = vaddq_s16(res0, round_offset128);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d_tmp);
+
+          res8 = vld1q_u16(d_tmp);
+          d_tmp += (dst_stride);
+
+          compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+                          bck_offset, round_offset64, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_u8(d_u8, t0);
+          d_u8 += (dst8_stride);
+        } else {
+          vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+          d_tmp += dst_stride;
+        }
+
+        s += (src_stride);
+        height--;
+#endif
       } while (height > 0);
       src_ptr += 8;
       dst_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 4bf45a52c..c4ae2e784 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AV1_COMMON_ARM_MEM_NEON_H_
-#define AV1_COMMON_ARM_MEM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
 
 #include <arm_neon.h>
 #include <string.h>
@@ -362,6 +362,15 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
   *tu1 = vset_lane_u32(a, *tu1, 1);
 }
 
+static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0) {
+  uint32_t a;
+  // Load one row of 4 bytes into lane 0 of *tu0; lane 1 is carried over.
+  memcpy(&a, buf, 4);  // byte-wise copy into a, then inserted as one u32
+  buf += stride;       // no further read of buf follows in this function
+  *tu0 = vset_lane_u32(a, *tu0, 0);  // reads the incoming *tu0 value
+}
+
 static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                          uint32x2_t *tu0) {
   uint32_t a;
@@ -482,4 +491,4 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
   vst1q_u32(s, s4);
 }
 
-#endif  // AV1_COMMON_ARM_MEM_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
index b4808a972..b3a37c4cb 100644
--- a/third_party/aom/av1/common/arm/selfguided_neon.c
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -1007,10 +1007,11 @@ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
       vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
 }
 
-void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
-                                int16_t *src, const int src_stride,
-                                int32_t *dst, const int dst_stride,
-                                const int width, const int height) {
+static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+                                       const int buf_stride, int16_t *src,
+                                       const int src_stride, int32_t *dst,
+                                       const int dst_stride, const int width,
+                                       const int height) {
   int16x8_t s0;
   int32_t *B_tmp, *dst_ptr;
   uint16_t *A_tmp;
@@ -1340,10 +1341,10 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
-                                     int stride, int32_t *flt0, int32_t *flt1,
-                                     int flt_stride, int sgr_params_idx,
-                                     int bit_depth, int highbd) {
+int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+                                    int stride, int32_t *flt0, int32_t *flt1,
+                                    int flt_stride, int sgr_params_idx,
+                                    int bit_depth, int highbd) {
   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
   assert(!(params->r[0] == 0 && params->r[1] == 0));
 
@@ -1376,6 +1377,7 @@ void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
   if (params->r[1] > 0)
     restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
                          bit_depth, sgr_params_idx, 1);
+  return 0;
 }
 
 void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index fe134087b..8a3d9f07f 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_
-#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
 
 #include <arm_neon.h>
 
@@ -386,6 +386,83 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                      vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
 }
 
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+  int16x8x2_t b0;  // val[0]: low 64 bits of a0 then a1; val[1]: high 64 bits
+  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+                           vreinterpret_s16_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+                           vreinterpret_s16_s32(vget_high_s32(a1)));
+  return b0;  // same bits as the inputs, viewed as 16-bit lanes
+}
+
+static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
+  // 8x8 transpose of a0[0..7] (shown as a0..a7). Swap 16 bit elements, from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+
+  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
+  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
+  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
+  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 04 14 24 34 44 54 64 74
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 05 15 25 35 45 55 65 75
+  // d2.val[0]: 02 12 22 32 42 52 62 72
+  // d2.val[1]: 06 16 26 36 46 56 66 76
+  // d3.val[0]: 03 13 23 33 43 53 63 73
+  // d3.val[1]: 07 17 27 37 47 57 67 77
+  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+  // Every input row was loaded above, so out may be the same array as a0.
+  *out = d0.val[0];
+  *(out + 1) = d1.val[0];
+  *(out + 2) = d2.val[0];
+  *(out + 3) = d3.val[0];
+  *(out + 4) = d0.val[1];
+  *(out + 5) = d1.val[1];
+  *(out + 6) = d2.val[1];
+  *(out + 7) = d3.val[1];
+}
+
 static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                       int16x4_t *a2, int16x4_t *a3) {
   // Swap 16 bit elements. Goes from:
@@ -457,4 +534,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
   *a3 = c1.val[1];
 }
 
-#endif  // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..7f02d42a7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+   * Each coefficient is stored in 8 bits instead of 16 bits
+   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+     This is done in order to avoid overflow: Since the tap with the largest
+     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+     convolve functions.
+
+     Instead, we use the summation order
+     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+     The rearrangement of coefficients in this table is so that we can get the
+     coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+                filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+  // [-1, 0): rows 0..63 (every row stores taps as 0, 2, 4, 6, 1, 3, 5, 7)
+  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
+  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
+  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
+  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
+  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
+  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
+  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
+  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
+  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
+  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
+  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
+  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
+  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
+  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
+  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
+  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
+  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
+  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
+  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
+  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
+  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
+  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
+  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
+  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
+  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
+  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
+  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
+  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
+  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
+  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
+  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
+  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
+  // [0, 1): rows 64..127
+  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
+  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
+  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
+  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
+  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
+  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
+  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
+  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
+  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
+  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
+  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
+  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
+  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
+  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
+  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
+  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
+  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
+  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
+  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
+  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
+  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
+  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
+  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
+  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
+  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
+  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
+  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
+  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
+  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
+  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
+  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
+  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
+  // [1, 2): rows 128..191
+  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
+  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
+  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
+  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
+  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
+  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
+  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
+  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
+  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
+  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
+  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
+  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
+  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
+  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
+  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
+  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
+  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
+  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
+  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
+  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
+  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
+  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
+  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
+  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
+  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
+  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
+  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
+  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
+  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
+  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
+  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
+  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
+  // row 192: padding row, a copy of row 191
+  { 0, 0,   2,  -1, 0,   0, 127, 0},
+
+#else
+  // [-1, 0): rows 0..31 (every row stores taps as 0, 2, 4, 6, 1, 3, 5, 7)
+  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
+  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
+  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
+  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
+  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
+  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
+  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
+  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
+  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
+  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
+  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
+  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
+  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
+  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
+  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
+  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
+  // [0, 1): rows 32..63
+  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
+  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
+  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
+  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
+  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
+  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
+  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
+  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
+  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
+  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
+  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
+  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
+  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
+  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
+  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
+  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
+  // [1, 2): rows 64..95
+  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
+  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
+  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
+  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
+  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
+  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
+  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
+  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
+  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
+  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
+  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
+  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
+  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
+  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
+  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
+  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
+  // row 96: padding row, a copy of row 95
+  { 0, 0,   4,  -3, 0,  -1, 127, 1},
+#endif  // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+// Multiplies two zero-extended 8-pixel vectors by their 16-bit taps, sums the
+// products lane-wise, and pairwise-adds the eight lanes into four in *res.
+static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
+                            uint8x8_t src_1, int16x4_t *res) {
+  // Taps for src_0: x0.val[0] fills the low half, x1.val[0] the high half.
+  const int16x8_t taps_a = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
+                                        vreinterpret_s16_s32(x1.val[0]));
+  // Taps for src_1, built the same way from the val[1] halves.
+  const int16x8_t taps_b = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
+                                        vreinterpret_s16_s32(x1.val[1]));
+  // Zero-extend the pixels to 16 bits per lane.
+  const int16x8_t px_a = vreinterpretq_s16_u16(vmovl_u8(src_0));
+  const int16x8_t px_b = vreinterpretq_s16_u16(vmovl_u8(src_1));
+
+  // acc = taps_a * px_a + taps_b * px_b in each 16-bit lane.
+  const int16x8_t acc = vmlaq_s16(vmulq_s16(taps_a, px_a), taps_b, px_b);
+  *res = vpadd_s16(vget_low_s16(acc), vget_high_s16(acc));
+}
+
+// Horizontal warp filter for one row of eight output pixels. Output pixel i
+// uses filter_8bit_neon[(sx + i * alpha) >> WARPEDDIFF_PREC_BITS]. Each sum is
+// offset by 1 << offset_bits_horiz, rounding-shifted right by reduce_bits_horiz
+// and stored in tmp_dst[k + 7] with lanes ordered as pixels 0 2 4 6 1 3 5 7.
+static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
+                                          uint8x16_t src_3, uint8x16_t src_4,
+                                          int16x8_t *tmp_dst, int sx, int alpha,
+                                          int k, const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
+                            255, 0, 255, 0, 255, 0, 255, 0 };
+  const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
+  const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
+
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+  int32x2x2_t b0, b1;
+  uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
+  int32x4_t tmp_res_low, tmp_res_high;
+  uint16x8_t res;
+  int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
+  // Only src_1 and src_2 feed the filter; src_3 and src_4 are never read.
+  (void)src_3;
+  (void)src_4;
+  // Even bytes + the same rotated by one: order 0 2 2 4 4 6 ... 14 14 0.
+  const uint8x16_t tmp_0 = vandq_u8(src_1, mask);
+  const uint8x16_t tmp_1 = vandq_u8(src_2, mask);
+  const uint8x16_t tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
+  const uint8x16_t tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
+  src_1 = vaddq_u8(tmp_0, tmp_2);
+  src_2 = vaddq_u8(tmp_1, tmp_3);
+
+  src_1_low = vget_low_u8(src_1);
+  src_2_low = vget_low_u8(src_2);
+  src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
+  src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
+  src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
+  src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+  // Loading the 8 filter taps
+  f0 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f1 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f2 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f3 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f4 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f5 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f6 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f7 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  // Taps 0, 2, 4, 6 for output pixels 0, 2, 4, 6.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
+                vreinterpret_s32_s16(vget_low_s16(f2)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
+                vreinterpret_s32_s16(vget_low_s16(f6)));
+  convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
+  // Taps 0, 2, 4, 6 for output pixels 1, 3, 5, 7.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
+                vreinterpret_s32_s16(vget_low_s16(f3)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
+                vreinterpret_s32_s16(vget_low_s16(f7)));
+  convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
+  // Taps 1, 3, 5, 7 for output pixels 0, 2, 4, 6.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
+                vreinterpret_s32_s16(vget_high_s16(f2)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
+                vreinterpret_s32_s16(vget_high_s16(f6)));
+  convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
+  // Taps 1, 3, 5, 7 for output pixels 1, 3, 5, 7.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
+                vreinterpret_s32_s16(vget_high_s16(f3)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
+                vreinterpret_s32_s16(vget_high_s16(f7)));
+  convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
+
+  tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
+  tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+  res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
+  res = vqrshlq_u16(res, shift);
+  tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+}
+
+// Vertical warp filter for one output row. src[0] .. src[7] are eight rows of
+// the caller's intermediate buffer, lanes in column order 0 2 4 6 1 3 5 7.
+// Column i uses the taps at warped_filter + ((sy + i * gamma) >> WARPEDDIFF_PREC_BITS).
+// Unrounded 32-bit sums: columns 0-3 go to *res_low, 4-7 to *res_high.
+static INLINE void vertical_filter_neon(const int16x8_t *src,
+                                        int32x4_t *res_low, int32x4_t *res_high,
+                                        int sy, int gamma) {
+  int16x4_t src_0, src_1, fltr_0, fltr_1;
+  int32x4_t res_0, res_1;
+  int32x2_t res_0_im, res_1_im;
+  int32x4_t res_even, res_odd, im_res_0, im_res_1;
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+  int16x8x2_t b0, b1, b2, b3;
+  int32x4x2_t c0, c1, c2, c3;
+  int32x4x2_t d0, d1, d2, d3;
+  // Interleave the 16-bit lanes of each pair of adjacent rows.
+  b0 = vtrnq_s16(src[0], src[1]);
+  b1 = vtrnq_s16(src[2], src[3]);
+  b2 = vtrnq_s16(src[4], src[5]);
+  b3 = vtrnq_s16(src[6], src[7]);
+  // Transpose 32-bit pairs so each 32-bit lane holds two rows of one column.
+  c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                 vreinterpretq_s32_s16(b0.val[1]));
+  c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
+                 vreinterpretq_s32_s16(b1.val[1]));
+  c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                 vreinterpretq_s32_s16(b2.val[1]));
+  c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
+                 vreinterpretq_s32_s16(b3.val[1]));
+  // Load the eight taps for each of the eight output columns.
+  f0 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f1 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f2 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f3 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f4 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f5 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f6 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f7 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  // Pair up the taps: even columns go to d0/d1, odd columns to d2/d3.
+  d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
+  d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
+  d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
+  d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+
+  // row:0,1 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+
+  // row:0,1,2,3 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:0,1 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+
+  // row:0,1,2,3 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:0,1,2,3 even_col:0,2,4,6
+  im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+  // row:4,5 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+
+  // row:4,5,6,7 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:4,5 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+
+  // row:4,5,6,7 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:4,5,6,7 even_col:0,2,4,6
+  im_res_1 = vcombine_s32(res_0_im, res_1_im);
+  // row:0-7 even_col:0,2,4,6
+  res_even = vaddq_s32(im_res_0, im_res_1);
+
+  // row:0,1 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+
+  // row:0,1,2,3 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:0,1 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+
+  // row:0,1,2,3 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:0,1,2,3 odd_col:1,3,5,7
+  im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+  // row:4,5 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+
+  // row:4,5,6,7 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:4,5 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+
+  // row:4,5,6,7 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:4,5,6,7 odd_col:1,3,5,7
+  im_res_1 = vcombine_s32(res_0_im, res_1_im);
+  // row:0-7 odd_col:1,3,5,7
+  res_odd = vaddq_s32(im_res_0, im_res_1);
+
+  // Interleave even and odd sums to restore column order 0 1 2 3 | 4 5 6 7
+  c0 = vtrnq_s32(res_even, res_odd);
+  // Final store: columns 0-3 to *res_low, columns 4-7 to *res_high
+  *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
+  *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+}
+
+// NEON warp for 8-bit pixels (bd is fixed at 8). For every 8x8 block of the
+// p_width x p_height prediction, the block centre is mapped through mat, then
+// rows from ref are filtered horizontally into tmp[] and vertically into the
+// output. Compound results go to conv_params->dst, or are averaged with it
+// into pred when do_average is set; otherwise pixels are written to pred.
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+                          int height, int stride, uint8_t *pred, int p_col,
+                          int p_row, int p_width, int p_height, int p_stride,
+                          int subsampling_x, int subsampling_y,
+                          ConvolveParams *conv_params, int16_t alpha,
+                          int16_t beta, int16_t gamma, int16_t delta) {
+  int16x8_t tmp[15];
+  const int bd = 8;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
+  const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
+  const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
+
+  int limit = 0;
+  uint8x16_t vec_dup, mask_val;
+  int32x4_t res_lo, res_hi;
+  int16x8_t result_final;
+  uint8x16_t src_1, src_2, src_3, src_4;
+  uint8x16_t indx_vec = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  uint8x16_t cmp_vec;
+
+  const int reduce_bits_horiz = conv_params->round_0;
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16x4_t res_sub_const =
+      vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
+                   (1 << (offset_bits - conv_params->round_1 - 1))));
+  int k;
+
+  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+      // Integer sample position (ix4, iy4) and fractional offset (sx4, sy4).
+      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      // Step back 4 pixels in x and y; add rounding and the table offset.
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      // Zero the low WARP_PARAM_REDUCE_BITS bits of both offsets.
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      // horizontal: fill tmp[] with up to 15 filtered rows (k = -7 .. 7)
+      if (ix4 <= -7) {  // every row is a constant built from column 0
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int16_t dup_val =
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+
+          tmp[k + 7] = vdupq_n_s16(dup_val);
+        }
+      } else if (ix4 >= width + 6) {  // constant built from column width - 1
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                            ref[iy * stride + (width - 1)] *
+                                (1 << (FILTER_BITS - reduce_bits_horiz));
+          tmp[k + 7] = vdupq_n_s16(dup_val);
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {  // partly outside
+        const int out_of_boundary_left = -(ix4 - 6);
+        const int out_of_boundary_right = (ix4 + 8) - width;
+
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+          // NOTE(review): load may begin before column 0; confirm ref is padded.
+          const uint8_t *src = ref + iy * stride + ix4 - 7;
+          src_1 = vld1q_u8(src);
+          // Replicate edge pixels into lanes that fall outside [0, width).
+          if (out_of_boundary_left >= 0) {
+            limit = out_of_boundary_left + 1;
+            cmp_vec = vdupq_n_u8(out_of_boundary_left);
+            vec_dup = vdupq_n_u8(*(src + limit));
+            mask_val = vcleq_u8(indx_vec, cmp_vec);
+            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+          }
+          if (out_of_boundary_right >= 0) {
+            limit = 15 - (out_of_boundary_right + 1);
+            cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+            vec_dup = vdupq_n_u8(*(src + limit));
+            mask_val = vcgeq_u8(indx_vec, cmp_vec);
+            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+          }
+          src_2 = vextq_u8(src_1, src_1, 1);
+          src_3 = vextq_u8(src_2, src_2, 1);
+          src_4 = vextq_u8(src_3, src_3, 1);
+
+          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+                                 offset_bits_horiz, reduce_bits_horiz);
+        }
+      } else {  // 16-byte window lies inside the row
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          const uint8_t *src = ref + iy * stride + ix4 - 7;
+          src_1 = vld1q_u8(src);
+          src_2 = vextq_u8(src_1, src_1, 1);
+          src_3 = vextq_u8(src_2, src_2, 1);
+          src_4 = vextq_u8(src_3, src_3, 1);
+
+          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+                                 offset_bits_horiz, reduce_bits_horiz);
+        }
+      }
+
+      // vertical: output row k + 4 filters tmp[k + 4] .. tmp[k + 11]
+      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        const int16x8_t *v_src = tmp + (k + 4);
+        vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
+
+        res_lo = vaddq_s32(res_lo, add_const_vert);
+        res_hi = vaddq_s32(res_hi, add_const_vert);
+
+        if (conv_params->is_compound) {
+          uint16_t *const p =
+              (uint16_t *)&conv_params
+                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
+          // Columns 0-3: rounding shift right by reduce_bits_vert.
+          res_lo = vrshlq_s32(res_lo, shift_vert);
+          if (conv_params->do_average) {
+            uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+            uint16x4_t tmp16_lo = vld1_u16(p);
+            int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
+            int16x4_t tmp16_low;
+            if (conv_params->use_jnt_comp_avg) {
+              res_lo = vmulq_s32(res_lo, bwd);
+              tmp32_lo = vmulq_s32(tmp32_lo, fwd);
+              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+              tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+            } else {
+              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+              tmp16_low = vshrn_n_s32(tmp32_lo, 1);
+            }
+            int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
+            res_low = vqrshl_s16(res_low, round_bits_vec);
+            int16x8_t final_res_low = vcombine_s16(res_low, res_low);
+            uint8x8_t res_8_low = vqmovun_s16(final_res_low);
+            // Saturated to 8 bits above; store four pixels to pred.
+            vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
+          } else {
+            uint16x4_t res_u16_low = vqmovun_s32(res_lo);
+            vst1_u16(p, res_u16_low);
+          }
+          if (p_width > 4) {
+            uint16_t *const p4 =
+                (uint16_t *)&conv_params
+                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+            // Columns 4-7, handled the same way.
+            res_hi = vrshlq_s32(res_hi, shift_vert);
+            if (conv_params->do_average) {
+              uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
+              uint16x4_t tmp16_hi = vld1_u16(p4);
+              int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
+              int16x4_t tmp16_high;
+              if (conv_params->use_jnt_comp_avg) {
+                res_hi = vmulq_s32(res_hi, bwd);
+                tmp32_hi = vmulq_s32(tmp32_hi, fwd);
+                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+                tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
+              } else {
+                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+                tmp16_high = vshrn_n_s32(tmp32_hi, 1);
+              }
+              int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
+              res_high = vqrshl_s16(res_high, round_bits_vec);
+              int16x8_t final_res_high = vcombine_s16(res_high, res_high);
+              uint8x8_t res_8_high = vqmovun_s16(final_res_high);
+              // Saturated to 8 bits above; store four pixels to pred.
+              vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+                            0);
+            } else {
+              uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+              vst1_u16(p4, res_u16_high);
+            }
+          }
+        } else {
+          res_lo = vrshlq_s32(res_lo, shift_vert);
+          res_hi = vrshlq_s32(res_hi, shift_vert);
+
+          result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+          result_final = vsubq_s16(result_final, sub_constant);
+          uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+          uint8x8_t val = vqmovun_s16(result_final);
+          if (p_width == 4) {
+            vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+          } else {
+            vst1_u8(p, val);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
index 72fbed4d4..a9bb5bcf0 100644
--- a/third_party/aom/av1/common/arm/wiener_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -26,7 +26,6 @@
    Apply horizontal filter and store in a temporary buffer. When applying
    vertical filter, overwrite the original pixel values.
  */
-
 void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
@@ -78,8 +77,10 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
   /* if height is a multiple of 8 */
   if (!(h & 7)) {
     int16x8_t res0, res1, res2, res3;
-    uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+    uint16x8_t res4;
     uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+#if defined(__aarch64__)
+    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
     uint8x8_t t8, t9, t10, t11, t12, t13, t14;
 
     do {
@@ -190,16 +191,64 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
       dst_ptr += 8 * MAX_SB_SIZE;
       height -= 8;
     } while (height > 0);
+#else
+    uint8x8_t temp_0;
+
+    do {
+      const uint8_t *s;
+
+      __builtin_prefetch(src_ptr);
+
+      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+      width = w;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        temp_0 = t0;
+        t0 = t7;
+
+        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        vst1q_u16(d_tmp, res4);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += MAX_SB_SIZE;
+      height--;
+    } while (height > 0);
+#endif
   } else {
     /*if height is a multiple of 4*/
-    int16x8_t tt0, tt1, tt2, tt3;
     const uint8_t *s;
+    int16x8_t tt0, tt1, tt2, tt3;
+    uint16x8_t d0;
+    uint8x8_t t0, t1, t2, t3;
+
+#if defined(__aarch64__)
     uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t d0, d1, d2, d3;
+    uint16x8_t d1, d2, d3;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
     int16x4_t s11, s12, s13, s14;
-    uint8x8_t t0, t1, t2, t3;
-
     do {
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
@@ -292,11 +341,61 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
       dst_ptr += 4 * MAX_SB_SIZE;
       height -= 4;
     } while (height > 0);
+#else
+    uint8x8_t temp_0, t4, t5, t6, t7;
+
+    do {
+      __builtin_prefetch(src_ptr);
+
+      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      __builtin_prefetch(dst_ptr);
+
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+      width = w;
+
+      do {
+        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        temp_0 = t0;
+        t0 = t7;
+
+        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
+                                        conv_params->round_0);
+
+        vst1q_u16(d_tmp, d0);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += src_stride;
+      dst_ptr += MAX_SB_SIZE;
+      height -= 1;
+    } while (height > 0);
+#endif
   }
 
   {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint8x8_t t0, t1, t2, t3;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    uint8x8_t t0;
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10;
+    uint8x8_t t1, t2, t3;
+#endif
     int16_t *src_tmp_ptr, *s;
     uint8_t *dst_tmp_ptr;
     height = h;
@@ -324,6 +423,7 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
       d = dst_tmp_ptr;
       height = h;
 
+#if defined(__aarch64__)
       do {
         __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
         __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
@@ -397,5 +497,34 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
 
       w -= 8;
     } while (w > 0);
+#else
+      do {
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+
+        s7 = vld1q_s16(s);
+        s += src_stride;
+
+        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+                                       bd, conv_params->round_1);
+
+        vst1_u8(d, t0);
+        d += dst_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+      } while (height > 0);
+
+      src_tmp_ptr += 8;
+      dst_tmp_ptr += 8;
+
+      w -= 8;
+    } while (w > 0);
+#endif
   }
 }
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
index 8514dc64c..7ef2d6d7f 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.c
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -11,56 +11,7 @@
 
 #include <stdlib.h>
 #include "av1/common/av1_inv_txfm1d.h"
-
-static void range_check_buf(int32_t stage, const int32_t *input,
-                            const int32_t *buf, int32_t size, int8_t bit) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  const int64_t max_value = (1LL << (bit - 1)) - 1;
-  const int64_t min_value = -(1LL << (bit - 1));
-
-  int in_range = 1;
-
-  for (int i = 0; i < size; ++i) {
-    if (buf[i] < min_value || buf[i] > max_value) {
-      in_range = 0;
-    }
-  }
-
-  if (!in_range) {
-    fprintf(stderr, "Error: coeffs contain out-of-range values\n");
-    fprintf(stderr, "size: %d\n", size);
-    fprintf(stderr, "stage: %d\n", stage);
-    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
-            max_value);
-
-    fprintf(stderr, "coeffs: ");
-
-    fprintf(stderr, "[");
-    for (int j = 0; j < size; j++) {
-      if (j > 0) fprintf(stderr, ", ");
-      fprintf(stderr, "%d", input[j]);
-    }
-    fprintf(stderr, "]\n");
-
-    fprintf(stderr, "   buf: ");
-
-    fprintf(stderr, "[");
-    for (int j = 0; j < size; j++) {
-      if (j > 0) fprintf(stderr, ", ");
-      fprintf(stderr, "%d", buf[j]);
-    }
-    fprintf(stderr, "]\n\n");
-  }
-
-  assert(in_range);
-#else
-  (void)stage;
-  (void)input;
-  (void)buf;
-  (void)size;
-  (void)bit;
-#endif
-}
+#include "av1/common/av1_txfm.h"
 
 // TODO(angiebird): Make 1-d txfm functions static
 //
@@ -84,7 +35,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = input[2];
   bf1[2] = input[1];
   bf1[3] = input[3];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -94,7 +45,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
   bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
   bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -129,7 +80,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = input[5];
   bf1[6] = input[3];
   bf1[7] = input[7];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -143,7 +94,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
   bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
   bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -157,7 +108,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
   bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
   bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -171,7 +122,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[7] = bf0[7];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -218,7 +169,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = input[11];
   bf1[14] = input[7];
   bf1[15] = input[15];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -240,7 +191,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
   bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -262,7 +213,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
   bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
   bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -284,7 +235,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
   bf1[15] = bf0[15];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -306,7 +257,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
   bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -328,7 +279,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -399,7 +350,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = input[23];
   bf1[30] = input[15];
   bf1[31] = input[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -437,7 +388,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
   bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
   bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -475,7 +426,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
   bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
   bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -513,7 +464,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
   bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
   bf1[31] = bf0[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -551,7 +502,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
   bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
   bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -589,7 +540,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -627,7 +578,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
   bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
   bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -665,7 +616,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -760,7 +711,6 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   output[1] = round_shift(x1, bit);
   output[2] = round_shift(x2, bit);
   output[3] = round_shift(x3, bit);
-  range_check_buf(6, input, output, 4, stage_range[6]);
 }
 
 void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -786,7 +736,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = input[4];
   bf1[6] = input[1];
   bf1[7] = input[6];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -800,7 +750,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
   bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -814,7 +764,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
   bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
   bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -828,7 +778,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
   bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -842,7 +792,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
   bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
   bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -856,7 +806,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[5];
   bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -903,7 +853,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = input[12];
   bf1[14] = input[1];
   bf1[15] = input[14];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -925,7 +875,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -947,7 +897,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
   bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -969,7 +919,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
   bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -991,7 +941,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
   bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -1013,7 +963,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
   bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -1035,7 +985,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
   bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -1057,7 +1007,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[13];
   bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -1193,7 +1143,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = input[47];
   bf1[62] = input[31];
   bf1[63] = input[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -1263,7 +1213,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
   bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
   bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -1333,7 +1283,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
   bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
   bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -1403,7 +1353,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
   bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -1473,7 +1423,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
   bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
   bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -1543,7 +1493,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -1613,7 +1563,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
   bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
   bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -1683,7 +1633,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -1753,7 +1703,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
   bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
   bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 10
   stage++;
@@ -1823,7 +1773,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 11
   stage++;
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
index 64a1a921c..c31c019aa 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_INV_TXFM1D_H_
-#define AV1_INV_TXFM1D_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
 
 #include "av1/common/av1_txfm.h"
 
@@ -58,4 +58,4 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
 }
 #endif
 
-#endif  // AV1_INV_TXFM1D_H_
+#endif  // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
index 4c600f756..7d80a0099 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_INV_TXFM2D_CFG_H_
-#define AV1_INV_TXFM2D_CFG_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
 #include "av1/common/av1_inv_txfm1d.h"
 
 // sum of fwd_shift_##
@@ -44,4 +44,4 @@ extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
 extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
 extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
 
-#endif  // AV1_INV_TXFM2D_CFG_H_
+#endif  // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 9d68b8760..537d8dfe9 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -68,23 +68,6 @@ static const int mode_lf_lut[] = {
 //    10101010|10101010
 //
 // A loopfilter should be applied to every other 4x4 horizontally.
-// TODO(chengchen): make these tables static
-const FilterMask left_txform_mask[TX_SIZES] = {
-  { { 0xffffffffffffffffULL,  // TX_4X4,
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-
-  { { 0x5555555555555555ULL,  // TX_8X8,
-      0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } },
-
-  { { 0x1111111111111111ULL,  // TX_16X16,
-      0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } },
-
-  { { 0x0101010101010101ULL,  // TX_32X32,
-      0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } },
-
-  { { 0x0001000100010001ULL,  // TX_64X64,
-      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
-};
 
 // 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
 // We use 4 uint64_t to represent the 256 bit.
@@ -113,98 +96,314 @@ const FilterMask left_txform_mask[TX_SIZES] = {
 //    00000000|00000000
 //
 // A loopfilter should be applied to every other 4x4 horizontally.
-const FilterMask above_txform_mask[TX_SIZES] = {
-  { { 0xffffffffffffffffULL,  // TX_4X4
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
 
-  { { 0x0000ffff0000ffffULL,  // TX_8X8
-      0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } },
-
-  { { 0x000000000000ffffULL,  // TX_16X16
-      0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } },
-
-  { { 0x000000000000ffffULL,  // TX_32X32
-      0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } },
-
-  { { 0x000000000000ffffULL,  // TX_64X64
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {  // one slot per block size, in block-size enum order
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18  // NOTE(review): values 0..18 presumably index the 19 "TX_4X4" entries of the *_mask_univariant_reordered tables, with -1 meaning "no entry" -- confirm at the lookup site
 };
 
-// 64 bit mask to shift and set for each prediction size. A bit is set for
-// each 4x4 block that would be in the top left most block of the given block
-// size in the 64x64 block.
-const FilterMask size_mask_y[BLOCK_SIZES_ALL] = {
-  { { 0x0000000000000001ULL,  // BLOCK_4X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000010001ULL,  // BLOCK_4X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000000003ULL,  // BLOCK_8X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000030003ULL,  // BLOCK_8X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0003000300030003ULL,  // BLOCK_8X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00000000000f000fULL,  // BLOCK_16X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X32
-      0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X32
-      0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X64
-      0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X32
-      0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X64
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-  // Y plane max coding block size is 128x128, but the codec divides it
-  // into 4 64x64 blocks.
-  // BLOCK_64X128
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-  // BLOCK_128X64
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-  // BLOCK_128X128
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-
-  { { 0x0001000100010001ULL,  // BLOCK_4X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000000000000000fULL,  // BLOCK_16X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0003000300030003ULL,  // BLOCK_8X32
-      0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {  // one slot per block size, in block-size enum order
+  -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13  // NOTE(review): values 0..13 presumably index the 14 "TX_8X8" entries of the *_mask_univariant_reordered tables, with -1 meaning "no entry" -- confirm at the lookup site
+};
 
-  { { 0x0000000000ff00ffULL,  // BLOCK_32X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {  // one slot per block size, in block-size enum order
+  -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8  // NOTE(review): values 0..8 presumably index the 9 "TX_16X16" entries of the *_mask_univariant_reordered tables, with -1 meaning "no entry" -- confirm at the lookup site
+};
 
-  { { 0x000f000f000f000fULL,  // BLOCK_16X64
-      0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } },
+const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,  // one slot per block size, in block-size enum order
+                                                      -1, -1, -1, 0,  1,  2,
+                                                      3,  -1, -1, -1, -1, -1,
+                                                      -1, -1, -1, -1 };  // NOTE(review): values 0..3 presumably index the 4 "TX_32X32" entries of the *_mask_univariant_reordered tables, with -1 meaning "no entry" -- confirm at the lookup site
+
+const FilterMask left_mask_univariant_reordered[67] = {  // 67 masks of four 64-bit words each, grouped by transform size; the comment on each entry names the block size it is for
+  // TX_4X4
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
+  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
+  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
+  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
+  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
+  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
+  // TX_8X8
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
+  { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
+  { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
+  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
+  { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
+  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
+  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+      0x0055005500550055ULL } },  // block size 32X64, TX_8X8
+  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
+  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+      0x5555555555555555ULL } },  // block size 64X64, TX_8X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
+  { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
+  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+      0x0005000500050005ULL } },  // block size 16X64, TX_8X8
+  { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
+  // TX_16X16
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
+  { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
+  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
+  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+      0x0011001100110011ULL } },  // block size 32X64, TX_16X16
+  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
+  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+      0x1111111111111111ULL } },  // block size 64X64, TX_16X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X16
+  { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
+  // TX_32X32
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+      0x0101010101010101ULL } },  // block size 32X64, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+      0x0101010101010101ULL } },  // block size 64X64, TX_32X32
+  // TX_64X64
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 64X64, TX_64X64
+  // 2:1, 1:2 transform sizes.
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
+  { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
+  { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X32
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
+  { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 32X64, TX_32X64
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
+  // 4:1, 1:4 transform sizes.
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X64
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
+};
 
-  { { 0xffffffffffffffffULL,  // BLOCK_64X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }
+const FilterMask above_mask_univariant_reordered[67] = {  // 67 masks of four 64-bit words each, listed in the same transform-size / block-size order as left_mask_univariant_reordered
+  // TX_4X4
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
+  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
+  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
+  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
+  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
+  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
+  // TX_8X8
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
+  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
+  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
+  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
+  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
+  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
+  // TX_16X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
+  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
+  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
+  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
+  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
+  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
+  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+      0x000000000000000fULL } },  // block size 16X64, TX_16X16
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
+  // TX_32X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
+  // TX_64X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
+  // 2:1, 1:2 transform sizes.
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
+  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
+  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
+  // 4:1, 1:4 transform sizes.
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
 };
 
 LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
                                      int mi_col) {
-  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
-      (mi_col << MI_SIZE_LOG2) >= cm->width)
-    return NULL;
   assert(cm->lf.lfm != NULL);
   const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
   const int col = mi_col >> MIN_MIB_SIZE_LOG2;
@@ -248,10 +447,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
            SIMD_WIDTH);
   }
 }
-static uint8_t get_filter_level(const AV1_COMMON *cm,
-                                const loop_filter_info_n *lfi_n,
-                                const int dir_idx, int plane,
-                                const MB_MODE_INFO *mbmi) {
+
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+                         const int dir_idx, int plane,
+                         const MB_MODE_INFO *mbmi) {
   const int segment_id = mbmi->segment_id;
   if (cm->delta_lf_present_flag) {
     int delta_lf;
@@ -374,30 +573,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
       }
     }
   }
-
-#if LOOP_FILTER_BITMASK
-  memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.y_level_above, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.y_level_left, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.u_level_above, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.u_level_left, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.v_level_above, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.v_level_left, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64);
-#endif  // LOOP_FILTER_BITMASK
 }
 
 #if LOOP_FILTER_BITMASK
@@ -413,7 +588,7 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
 // After locating which uint64_t, mi_row % 4 is the
 // row offset, and each row has 16 = 1 << stride_log2 4x4 units.
 // Therefore, shift = (row << stride_log2) + mi_col;
-static int get_index_shift(int mi_col, int mi_row, int *index) {
+int get_index_shift(int mi_col, int mi_row, int *index) {
   // *index = mi_row >> 2;
   // rows = mi_row % 4;
   // stride_log2 = 4;
@@ -588,15 +763,9 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
           else
             lfm->lfl_y_hor[row][col] = level;
         } else if (plane == 1) {
-          if (dir == VERT_EDGE)
-            lfm->lfl_u_ver[row][col] = level;
-          else
-            lfm->lfl_u_hor[row][col] = level;
+          lfm->lfl_u[row][col] = level;
         } else {
-          if (dir == VERT_EDGE)
-            lfm->lfl_v_ver[row][col] = level;
-          else
-            lfm->lfl_v_hor[row][col] = level;
+          lfm->lfl_v[row][col] = level;
         }
       }
     }
@@ -623,11 +792,12 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
           const TX_SIZE prev_tx_size =
               plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
                     : mbmi_prev->tx_size;
-          const TX_SIZE min_tx_size =
-              (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size],
-                                          txsize_horz_map[prev_tx_size])
-                                 : AOMMIN(txsize_vert_map[tx_size],
-                                          txsize_vert_map[prev_tx_size]);
+          TX_SIZE min_tx_size = (dir == VERT_EDGE)
+                                    ? AOMMIN(txsize_horz_map[tx_size],
+                                             txsize_horz_map[prev_tx_size])
+                                    : AOMMIN(txsize_vert_map[tx_size],
+                                             txsize_vert_map[prev_tx_size]);
+          min_tx_size = AOMMIN(min_tx_size, TX_16X16);
           assert(min_tx_size < TX_SIZES);
           const int row = r % MI_SIZE_64X64;
           const int col = c % MI_SIZE_64X64;
@@ -883,13 +1053,11 @@ void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
       } else if (plane == 1) {
         av1_zero(lfm->left_u);
         av1_zero(lfm->above_u);
-        av1_zero(lfm->lfl_u_ver);
-        av1_zero(lfm->lfl_u_hor);
+        av1_zero(lfm->lfl_u);
       } else {
         av1_zero(lfm->left_v);
         av1_zero(lfm->above_v);
-        av1_zero(lfm->lfl_v_ver);
-        av1_zero(lfm->lfl_v_hor);
+        av1_zero(lfm->lfl_v);
       }
     }
   }
@@ -979,13 +1147,10 @@ static void filter_selectively_vert_row2(
 
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
           if (plane) {
-            // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
-            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                               lfi1->hev_thr);
+            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
           } else {
-            // TODO(any): add dual function simd function. Current sse2 code
-            // just called aom_lpf_vertical_14_sse2 twice.
             aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                      lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                      lfi1->hev_thr);
@@ -1005,9 +1170,9 @@ static void filter_selectively_vert_row2(
 
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
           if (plane) {
-            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                               lfi1->hev_thr);
+            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
           } else {
             aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
@@ -1070,10 +1235,9 @@ static void highbd_filter_selectively_vert_row2(
 
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
           if (plane) {
-            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
-                                      lfi0->hev_thr, bd);
-            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
-                                      lfi1->lim, lfi1->hev_thr, bd);
+            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
           } else {
             aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                             lfi0->hev_thr, lfi1->mblim,
@@ -1094,10 +1258,9 @@ static void highbd_filter_selectively_vert_row2(
 
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
           if (plane) {
-            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
-                                      lfi0->hev_thr, bd);
-            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
-                                      lfi1->lim, lfi1->hev_thr, bd);
+            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
           } else {
             aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                            lfi0->hev_thr, lfi1->mblim,
@@ -1163,13 +1326,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
             plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
 
         if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          /*
-          aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                     lfi->hev_thr);
-          */
-
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-          lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+          if (plane) {
+            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          } else {
+            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, lfin->mblim, lfin->lim,
+                                       lfin->hev_thr);
+          }
           count = 2;
         } else {
           lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1181,28 +1346,24 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
             plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
 
         if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          /*
-          aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, lfin->mblim, lfin->lim,
-                                    lfin->hev_thr);
-          */
-
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-          lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+          if (plane) {
+            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          } else {
+            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          }
           count = 2;
         } else {
           lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          /*
           aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
-          */
-          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-          aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
-                               lfin->hev_thr);
           count = 2;
         } else {
           aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1239,15 +1400,15 @@ static void highbd_filter_selectively_horiz(
             plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
 
         if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          /*
-          aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                            lfi->hev_thr, bd);
-          */
-
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-          highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
-                                lfin->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                              lfi->hev_thr, lfin->mblim,
+                                              lfin->lim, lfin->hev_thr, bd);
+          }
           count = 2;
         } else {
           highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1258,15 +1419,15 @@ static void highbd_filter_selectively_horiz(
             plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
 
         if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          /*
-          aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                           lfi->hev_thr, lfin->mblim, lfin->lim,
-                                           lfin->hev_thr, bd);
-          */
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-          highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
-                                lfin->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          }
           count = 2;
         } else {
           highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1274,15 +1435,9 @@ static void highbd_filter_selectively_horiz(
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          /*
           aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
-          */
-          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, bd);
-          aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr, bd);
           count = 2;
         } else {
           aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
@@ -1299,43 +1454,289 @@ static void highbd_filter_selectively_horiz(
   }
 }
 
-static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
-                           uint8_t *dst_buf, int ref_stride, int dst_stride,
-                           int start, int end) {
-  return 0;
-
-  start <<= MI_SIZE_LOG2;
-  end <<= MI_SIZE_LOG2;
-  uint8_t *ref0 = ref_buf;
-  uint8_t *dst0 = dst_buf;
-  if (cm->seq_params.use_highbitdepth) {
-    const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
-    const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
-    for (int j = 0; j < 4; ++j) {
-      for (int i = start; i < end; ++i)
-        if (ref16[i] != dst16[i]) {
-          ref_buf = ref0;
-          dst_buf = dst0;
-          return i + 1;
+void av1_build_bitmask_vert_info(
+    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane) {
+  const int subsampling_x = plane_ptr->subsampling_x;
+  const int subsampling_y = plane_ptr->subsampling_y;
+  const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
+  const int is_uv = plane > 0;
+  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+  uint8_t level, prev_level = 1;
+  int skip, prev_skip = 0;
+  int is_coding_block_border;
+
+  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
+    const int mi_row = r << subsampling_y;
+    const int row = mi_row % MI_SIZE_64X64;
+    int index = 0;
+    const int shift = get_index_shift(0, row, &index);
+
+    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+      const int mi_col = c << subsampling_x;
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+      for (int col_in_unit = 0;
+           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+        if (x >= plane_ptr->dst.width) break;
+        const int col = col_in_unit << subsampling_x;
+        const uint64_t mask = ((uint64_t)1 << (shift | col));
+        skip = (lfm->skip.bits[index] & mask) != 0;  // 64-bit test -> 0/1
+        is_coding_block_border = (lfm->is_vert_border.bits[index] & mask) != 0;
+        switch (plane) {
+          case 0: level = lfm->lfl_y_ver[row][col]; break;
+          case 1: level = lfm->lfl_u[row][col]; break;
+          case 2: level = lfm->lfl_v[row][col]; break;
+          default: assert(plane >= 0 && plane <= 2); return;
         }
-      ref16 += ref_stride;
-      dst16 += dst_stride;
+        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+          if (is_uv && ts == TX_64X64) continue;
+          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+            tx_size = ts;
+            break;
+          }
+        }
+        if ((c + col_in_unit > 0) && (level || prev_level) &&
+            (!prev_skip || !skip || is_coding_block_border)) {
+          const TX_SIZE min_tx_size =
+              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+          const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
+          const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
+          const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+          switch (plane) {
+            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+            default: assert(plane >= 0 && plane <= 2); return;
+          }
+        }
+
+        // carry this tx block's state over as the left neighbour of the next
+        prev_level = level;
+        prev_skip = skip;
+        prev_tx_size = tx_size;
+        // step by the width (in MI units) of the tx block just examined
+        col_in_unit += tx_size_wide_unit[tx_size];
+      }
     }
-  } else {
-    for (int j = 0; j < 4; ++j) {
-      for (int i = start; i < end; ++i)
-        if (ref_buf[i] != dst_buf[i]) {
-          ref_buf = ref0;
-          dst_buf = dst0;
-          return i + 1;
+  }
+}
+
+void av1_build_bitmask_horz_info(
+    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane) {
+  const int subsampling_x = plane_ptr->subsampling_x;
+  const int subsampling_y = plane_ptr->subsampling_y;
+  const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
+  const int is_uv = plane > 0;
+  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+  uint8_t level, prev_level = 1;
+  int skip, prev_skip = 0;
+  int is_coding_block_border;
+
+  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
+    const int mi_col = c << subsampling_x;
+    const int col = mi_col % MI_SIZE_64X64;
+
+    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+      const int mi_row = r << subsampling_y;
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+      for (int r_in_unit = 0;
+           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+        if (y >= plane_ptr->dst.height) break;
+        const int row = r_in_unit << subsampling_y;
+        int index = 0;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        skip = (lfm->skip.bits[index] & mask) != 0;  // 64-bit test -> 0/1
+        is_coding_block_border = (lfm->is_horz_border.bits[index] & mask) != 0;
+        switch (plane) {
+          case 0: level = lfm->lfl_y_hor[row][col]; break;
+          case 1: level = lfm->lfl_u[row][col]; break;
+          case 2: level = lfm->lfl_v[row][col]; break;
+          default: assert(plane >= 0 && plane <= 2); return;
         }
-      ref_buf += ref_stride;
-      dst_buf += dst_stride;
+        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+          if (is_uv && ts == TX_64X64) continue;
+          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+            tx_size = ts;
+            break;
+          }
+        }
+        if ((r + r_in_unit > 0) && (level || prev_level) &&
+            (!prev_skip || !skip || is_coding_block_border)) {
+          const TX_SIZE min_tx_size =
+              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+          const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
+          const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
+          const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+          switch (plane) {
+            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+            default: assert(plane >= 0 && plane <= 2); return;
+          }
+        }
+
+        // carry this tx block's state over as the above neighbour of the next
+        prev_level = level;
+        prev_skip = skip;
+        prev_tx_size = tx_size;
+        // step by the height (in MI units) of the tx block just examined
+        r_in_unit += tx_size_high_unit[tx_size];
+      }
+    }
+  }
+}
+
+void av1_filter_block_plane_bitmask_vert(
+    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+    int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  uint8_t *const buf0 = dst->buf;  // entry value, restored before returning
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;  // low 16 bits kept after shifting
+  const int row_step = 1 << ssy;
+  const int two_row_step = 2 << ssy;
+  const int row_stride = dst->stride << MI_SIZE_LOG2;
+  const int two_row_stride = row_stride << 1;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  uint8_t *lfl2;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  assert(lfm);
+
+  // Vertical-edge filtering, two mask rows (row, row_next) per iteration.
+  for (int r = 0;
+       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+       r += two_row_step) {
+    const int row = r | ssy;
+    const int row_next = row + row_step;
+    const int col = ssx;
+    int index = 0;
+    const int shift = get_index_shift(col, row, &index);
+    int index_next = 0;  // filled by the call below but never read
+    const int shift_next = get_index_shift(col, row_next, &index_next);
+    switch (pl) {
+      case 0:
+        mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+        lfl = &lfm->lfl_y_ver[row][col];
+        lfl2 = &lfm->lfl_y_ver[row_next][col];
+        break;
+      case 1:
+        mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+        lfl = &lfm->lfl_u[row][col];
+        lfl2 = &lfm->lfl_u[row_next][col];
+        break;
+      case 2:
+        mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+        lfl = &lfm->lfl_v[row][col];
+        lfl2 = &lfm->lfl_v[row_next][col];
+        break;
+      default: assert(pl >= 0 && pl <= 2); return;
+    }
+    uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+    uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+    uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+    uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+    uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+    uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+    if (cm->seq_params.use_highbitdepth)
+      highbd_filter_selectively_vert_row2(
+          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+    else
+      filter_selectively_vert_row2(
+          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+    dst->buf += two_row_stride;  // move down by two MI-sized rows of pixels
+  }
+  // restore dst->buf to its entry value; the loop above advanced it
+  dst->buf = buf0;
+}
+
+void av1_filter_block_plane_bitmask_horz(
+    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+    int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  uint8_t *const buf0 = dst->buf;  // entry value, restored before returning
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;  // low 16 bits kept after shifting
+  const int row_step = 1 << ssy;
+  const int row_stride = dst->stride << MI_SIZE_LOG2;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  assert(lfm);
+  for (int r = 0;
+       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+       r += row_step) {
+    if (mi_row + r == 0) {  // row 0 is skipped (presumably no edge above it)
+      dst->buf += row_stride;
+      continue;
     }
+    const int row = r | ssy;
+    const int col = ssx;
+    int index = 0;
+    const int shift = get_index_shift(col, row, &index);
+    switch (pl) {
+      case 0:
+        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+        lfl = &lfm->lfl_y_hor[row][col];
+        break;
+      case 1:
+        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+        lfl = &lfm->lfl_u[row][col];
+        break;
+      case 2:
+        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+        lfl = &lfm->lfl_v[row][col];
+        break;
+      default: assert(pl >= 0 && pl <= 2); return;
+    }
+    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+    if (cm->seq_params.use_highbitdepth)
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+    else
+      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
+    dst->buf += row_stride;  // move down by one MI-sized row of pixels
   }
-  ref_buf = ref0;
-  dst_buf = dst0;
-  return 0;
+  // restore dst->buf to its entry value; the loop above advanced it
+  dst->buf = buf0;
 }
 
 void av1_filter_block_plane_ver(AV1_COMMON *const cm,
@@ -1385,15 +1786,15 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
           mask_16x16 = lfm->left_u[TX_16X16].bits[index];
           mask_8x8 = lfm->left_u[TX_8X8].bits[index];
           mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_ver[row][col];
-          lfl2 = &lfm->lfl_u_ver[row_next][col];
+          lfl = &lfm->lfl_u[row][col];
+          lfl2 = &lfm->lfl_u[row_next][col];
           break;
         case 2:
           mask_16x16 = lfm->left_v[TX_16X16].bits[index];
           mask_8x8 = lfm->left_v[TX_8X8].bits[index];
           mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_ver[row][col];
-          lfl2 = &lfm->lfl_v_ver[row_next][col];
+          lfl = &lfm->lfl_v[row][col];
+          lfl2 = &lfm->lfl_v[row_next][col];
           break;
         default: assert(pl >= 0 && pl <= 2); return;
       }
@@ -1460,13 +1861,13 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
           mask_16x16 = lfm->above_u[TX_16X16].bits[index];
           mask_8x8 = lfm->above_u[TX_8X8].bits[index];
           mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_hor[row][col];
+          lfl = &lfm->lfl_u[row][col];
           break;
         case 2:
           mask_16x16 = lfm->above_v[TX_16X16].bits[index];
           mask_8x8 = lfm->above_v[TX_8X8].bits[index];
           mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_hor[row][col];
+          lfl = &lfm->lfl_v[row][col];
           break;
         default: assert(pl >= 0 && pl <= 2); return;
       }
@@ -1820,6 +2221,9 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
 
 static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                              MACROBLOCKD *xd, int start, int stop,
+#if LOOP_FILTER_BITMASK
+                             int is_decoding,
+#endif
                              int plane_start, int plane_end) {
   struct macroblockd_plane *pd = xd->plane;
   const int col_start = 0;
@@ -1827,6 +2231,45 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
   int mi_row, mi_col;
   int plane;
 
+#if LOOP_FILTER_BITMASK
+  if (is_decoding) {
+    for (plane = plane_start; plane < plane_end; plane++) {
+      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+        break;
+      else if (plane == 1 && !(cm->lf.filter_level_u))
+        continue;
+      else if (plane == 2 && !(cm->lf.filter_level_v))
+        continue;
+
+      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
+                           plane, plane + 1);
+      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+
+      // apply loop filtering which only goes through buffer once
+      for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
+        for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
+          av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col,
+                               plane, plane + 1);
+          av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
+                                              mi_col);
+          if (mi_col - MI_SIZE_64X64 >= 0) {
+            av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+                                 mi_col - MI_SIZE_64X64, plane, plane + 1);
+            av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+                                                mi_col - MI_SIZE_64X64);
+          }
+        }
+        av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+                             mi_col - MI_SIZE_64X64, plane, plane + 1);
+        av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+                                            mi_col - MI_SIZE_64X64);
+      }
+    }
+    return;
+  }
+#endif
+
   for (plane = plane_start; plane < plane_end; plane++) {
     if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
       break;
@@ -1910,8 +2353,11 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
 }
 
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                           MACROBLOCKD *xd, int plane_start, int plane_end,
-                           int partial_frame) {
+                           MACROBLOCKD *xd,
+#if LOOP_FILTER_BITMASK
+                           int is_decoding,
+#endif
+                           int plane_start, int plane_end, int partial_frame) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
@@ -1923,6 +2369,9 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
-  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                   plane_end);
+  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
+#if LOOP_FILTER_BITMASK
+                   is_decoding,
+#endif
+                   plane_start, plane_end);
 }
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
index c35c3b2dc..80ac61178 100644
--- a/third_party/aom/av1/common/av1_loopfilter.h
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_LOOPFILTER_H_
-#define AV1_COMMON_LOOPFILTER_H_
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
 
 #include "config/aom_config.h"
 
@@ -60,51 +60,20 @@ typedef struct {
   uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
   uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
 
-  // U plane vertical edge and horizontal edge filter level
-  uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+  // U plane filter level
+  uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
 
-  // V plane vertical edge and horizontal edge filter level
-  uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-} LoopFilterMask;
+  // V plane filter level
+  uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
 
-// To determine whether to apply loop filtering at one transform block edge,
-// we need information of the neighboring transform block. Specifically,
-// in determining a vertical edge, we need the information of the tx block
-// to its left. For a horizontal edge, we need info of the tx block above it.
-// Thus, we need to record info of right column and bottom row of tx blocks.
-// We record the information of the neighboring superblock, when bitmask
-// building for a superblock is finished. And it will be used for next
-// superblock bitmask building.
-// Information includes:
-// ------------------------------------------------------------
-//                    MI_SIZE_64X64
-// Y  tx_size above |--------------|
-// Y  tx_size left  |--------------|
-// UV tx_size above |--------------|
-// UV tx_size left  |--------------|
-// Y level above    |--------------|
-// Y level left     |--------------|
-// U level above    |--------------|
-// U level left     |--------------|
-// V level above    |--------------|
-// V level left     |--------------|
-// skip             |--------------|
-// ------------------------------------------------------------
-typedef struct {
-  TX_SIZE tx_size_y_above[MI_SIZE_64X64];
-  TX_SIZE tx_size_y_left[MI_SIZE_64X64];
-  TX_SIZE tx_size_uv_above[MI_SIZE_64X64];
-  TX_SIZE tx_size_uv_left[MI_SIZE_64X64];
-  uint8_t y_level_above[MI_SIZE_64X64];
-  uint8_t y_level_left[MI_SIZE_64X64];
-  uint8_t u_level_above[MI_SIZE_64X64];
-  uint8_t u_level_left[MI_SIZE_64X64];
-  uint8_t v_level_above[MI_SIZE_64X64];
-  uint8_t v_level_left[MI_SIZE_64X64];
-  uint8_t skip[MI_SIZE_64X64];
-} LpfSuperblockInfo;
+  // other info
+  FilterMask skip;
+  FilterMask is_vert_border;
+  FilterMask is_horz_border;
+  // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+  FilterMask tx_size_ver[2][5];
+  FilterMask tx_size_hor[2][5];
+} LoopFilterMask;
 #endif  // LOOP_FILTER_BITMASK
 
 struct loopfilter {
@@ -130,7 +99,6 @@ struct loopfilter {
   LoopFilterMask *lfm;
   size_t lfm_num;
   int lfm_stride;
-  LpfSuperblockInfo neighbor_sb_lpf_info;
 #endif  // LOOP_FILTER_BITMASK
 };
 
@@ -157,9 +125,15 @@ void av1_loop_filter_init(struct AV1Common *cm);
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
+#if LOOP_FILTER_BITMASK
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+                           struct macroblockd *mbd, int is_decoding,
+                           int plane_start, int plane_end, int partial_frame);
+#else
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                            struct macroblockd *mbd, int plane_start,
                            int plane_end, int partial_frame);
+#endif
 
 void av1_filter_block_plane_vert(const struct AV1Common *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
@@ -180,6 +154,9 @@ typedef struct LoopFilterWorkerData {
   MACROBLOCKD *xd;
 } LFWorkerData;
 
+uint8_t get_filter_level(const struct AV1Common *cm,
+                         const loop_filter_info_n *lfi_n, const int dir_idx,
+                         int plane, const MB_MODE_INFO *mbmi);
 #if LOOP_FILTER_BITMASK
 void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
                        int plane, int subsampling_x, int subsampling_y,
@@ -192,10 +169,59 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm,
 void av1_filter_block_plane_hor(struct AV1Common *const cm,
                                 struct macroblockd_plane *const plane, int pl,
                                 int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+                                     int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {
+  { { 0x0000000000000001ULL,  // TX_4X4,
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0000000000010001ULL,  // TX_8X8,
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_16X16,
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_32X32,
+      0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_64X64,
+      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {
+  {
+      0x0000000000000001ULL,  // TX_4X4
+      0x0000000000000003ULL,  // TX_8X8
+      0x000000000000000fULL,  // TX_16X16
+      0x00000000000000ffULL,  // TX_32X32
+      0x000000000000ffffULL,  // TX_64X64
+  },
+  {
+      0x0000000000000001ULL,  // TX_4X4
+      0x0000000000000005ULL,  // TX_8X8
+      0x0000000000000055ULL,  // TX_16X16
+      0x0000000000005555ULL,  // TX_32X32
+      0x0000000055555555ULL,  // TX_64X64
+  },
+};
+
+extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+
+extern const FilterMask left_mask_univariant_reordered[67];
+
+extern const FilterMask above_mask_univariant_reordered[67];
 #endif
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_LOOPFILTER_H_
+#endif  // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index fa8b34981..dee1f1c79 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -76,12 +76,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
 specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
 specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
 
+
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
 add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
 add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
 
-
 # FILTER_INTRA predictor functions
 add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
 specialize qw/av1_filter_intra_predictor sse4_1/;
@@ -108,6 +108,22 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
 
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
@@ -122,9 +138,7 @@ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
 
 add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -132,8 +146,6 @@ add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *ou
 add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 
-specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/;
-
 add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -146,13 +158,13 @@ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride
 
 # build compound seg mask functions
 add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1/;
+specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
 
 add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
 specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
 
 add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
 
 #
 # Encoder functions below this point.
@@ -186,7 +198,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -203,6 +217,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -218,7 +233,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
   specialize qw/av1_temporal_filter_apply sse2 msa/;
 
-  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
   # ENCODEMB INVOKE
 
@@ -238,7 +253,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
   specialize qw/av1_get_nz_map_contexts sse2/;
   add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
-  specialize qw/av1_txb_init_levels sse4_1/;
+  specialize qw/av1_txb_init_levels sse4_1 avx2/;
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
@@ -251,6 +266,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
   specialize qw/av1_get_crc32c_value sse4_2/;
 
+  add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride,  double *M, double *H";
+  specialize qw/av1_compute_stats sse4_1 avx2/;
+
+  add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+  specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
 }
 # end encoder functions
 
@@ -275,7 +295,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
 # WARPED_MOTION / GLOBAL_MOTION functions
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1/;
+specialize qw/av1_warp_affine sse4_1 neon/;
 
 add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_highbd_warp_affine sse4_1/;
@@ -290,9 +310,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
 specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
 
-add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
-                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
-                                  int sgr_params_idx, int bit_depth, int highbd";
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                 int sgr_params_idx, int bit_depth, int highbd";
 specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
index 1e6654121..bb70eab70 100644
--- a/third_party/aom/av1/common/av1_txfm.c
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -108,3 +108,53 @@ const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
   1,   // TXFM_TYPE_IDENTITY16
   1,   // TXFM_TYPE_IDENTITY32
 };
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+                         const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t max_value = (1LL << (bit - 1)) - 1;  // signed 'bit'-bit max
+  const int64_t min_value = -(1LL << (bit - 1));     // signed 'bit'-bit min
+  // Verify that all 'size' entries of buf lie in [min_value, max_value].
+  int in_range = 1;
+
+  for (int i = 0; i < size; ++i) {
+    if (buf[i] < min_value || buf[i] > max_value) {
+      in_range = 0;  // keep scanning; a single bad entry fails the buffer
+    }
+  }
+  // On failure, dump both buffers to stderr so the assert below has context.
+  if (!in_range) {
+    fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+    fprintf(stderr, "size: %d\n", size);
+    fprintf(stderr, "stage: %d\n", stage);
+    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+            max_value);
+
+    fprintf(stderr, "coeffs: ");  // contents of 'input' (not range-checked)
+
+    fprintf(stderr, "[");
+    for (int j = 0; j < size; j++) {
+      if (j > 0) fprintf(stderr, ", ");
+      fprintf(stderr, "%d", input[j]);
+    }
+    fprintf(stderr, "]\n");
+
+    fprintf(stderr, "   buf: ");  // contents of 'buf' (the checked values)
+
+    fprintf(stderr, "[");
+    for (int j = 0; j < size; j++) {
+      if (j > 0) fprintf(stderr, ", ");
+      fprintf(stderr, "%d", buf[j]);
+    }
+    fprintf(stderr, "]\n\n");
+  }
+
+  assert(in_range);  // compiled out when NDEBUG is defined
+#else  // range checking disabled: mark every parameter as used
+  (void)stage;
+  (void)input;
+  (void)buf;
+  (void)size;
+  (void)bit;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+}
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index c9cc79852..59d64ca4a 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_TXFM_H_
-#define AV1_TXFM_H_
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
 
 #include <assert.h>
 #include <math.h>
@@ -39,7 +39,7 @@ extern const int32_t av1_sinpi_arr_data[7][5];
 static const int cos_bit_min = 10;
 static const int cos_bit_max = 16;
 
-static const int NewSqrt2Bits = 12;
+#define NewSqrt2Bits ((int32_t)12)
 // 2^12 * sqrt(2)
 static const int32_t NewSqrt2 = 5793;
 // 2^12 / sqrt(2)
@@ -64,7 +64,7 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
 #if DO_RANGE_CHECK_CLAMP
   bit = AOMMIN(bit, 31);
-  return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1)));
+  return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1);
 #endif  // DO_RANGE_CHECK_CLAMP
   (void)bit;
   return value;
@@ -78,10 +78,25 @@ static INLINE int32_t round_shift(int64_t value, int bit) {
 static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
                                int bit) {
   int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+  int64_t intermediate = result_64 + (1LL << (bit - 1));
+  // NOTE(david.barker): The value 'result_64' may not necessarily fit
+  // into 32 bits. However, the result of this function is nominally
+  // ROUND_POWER_OF_TWO_64(result_64, bit)
+  // and that is required to fit into stage_range[stage] many bits
+  // (checked by range_check_buf()).
+  //
+  // Here we've unpacked that rounding operation, and it can be shown
+  // that the value of 'intermediate' here *does* fit into 32 bits
+  // for any conformant bitstream.
+  // The upshot is that, if you do all this calculation using
+  // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+  // then you'll still get the correct result.
+  // To provide a check on this logic, we assert that 'intermediate'
+  // would fit into an int32 if range checking is enabled.
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
-  assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX);
+  assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
 #endif
-  return round_shift(result_64, bit);
+  return (int32_t)(intermediate >> bit);
 }
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
@@ -206,9 +221,12 @@ static INLINE int get_txw_idx(TX_SIZE tx_size) {
 static INLINE int get_txh_idx(TX_SIZE tx_size) {
   return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
 }
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+                         const int32_t *buf, int32_t size, int8_t bit);
 #define MAX_TXWH_IDX 5
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
 
-#endif  // AV1_TXFM_H_
+#endif  // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
index 86b4b5d6c..2e796b656 100644
--- a/third_party/aom/av1/common/blockd.c
+++ b/third_party/aom/av1/common/blockd.c
@@ -28,66 +28,6 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
   return above_mi->mode;
 }
 
-void av1_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
-    foreach_transformed_block_visitor visit, void *arg) {
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
-  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-  const uint8_t txw_unit = tx_size_wide_unit[tx_size];
-  const uint8_t txh_unit = tx_size_high_unit[tx_size];
-  const int step = txw_unit * txh_unit;
-  int i = 0, r, c;
-
-  // If mb_to_right_edge is < 0 we are in a situation in which
-  // the current block size extends into the UMV and we won't
-  // visit the sub blocks that are wholly within the UMV.
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
-  int blk_row, blk_col;
-
-  const BLOCK_SIZE max_unit_bsize =
-      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-  mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
-  mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
-
-  // Keep track of the row and column of the blocks we use so that we know
-  // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
-    const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
-    // Skip visiting the sub blocks that are wholly within the UMV.
-    for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
-      const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
-      for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
-        for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
-          visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
-          i += step;
-        }
-      }
-    }
-  }
-}
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   foreach_transformed_block_visitor visit,
-                                   void *arg, const int num_planes) {
-  for (int plane = 0; plane < num_planes; ++plane) {
-    if (!is_chroma_reference(mi_row, mi_col, bsize,
-                             xd->plane[plane].subsampling_x,
-                             xd->plane[plane].subsampling_y))
-      continue;
-    av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
-  }
-}
-
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                       int has_eob, int aoff, int loff) {
@@ -159,6 +99,10 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
     xd->plane[i].subsampling_x = i ? ss_x : 0;
     xd->plane[i].subsampling_y = i ? ss_y : 0;
   }
+  for (i = num_planes; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].subsampling_x = 1;
+    xd->plane[i].subsampling_y = 1;
+  }
 }
 
 const int16_t dr_intra_derivative[90] = {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 979f13bd9..a2311c1b0 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_BLOCKD_H_
-#define AV1_COMMON_BLOCKD_H_
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
 
 #include "config/aom_config.h"
 
@@ -38,13 +38,13 @@ extern "C" {
 #define MAX_DIFFWTD_MASK_BITS 1
 
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   DIFFWTD_38 = 0,
   DIFFWTD_38_INV,
   DIFFWTD_MASK_TYPES,
 } DIFFWTD_MASK_TYPE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   KEY_FRAME = 0,
   INTER_FRAME = 1,
   INTRA_ONLY_FRAME = 2,  // replaces intra-only
@@ -57,7 +57,7 @@ static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
 }
 
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
-  return mode >= NEARESTMV && mode <= NEW_NEWMV;
+  return mode >= INTER_MODE_START && mode < INTER_MODE_END;
 }
 
 typedef struct {
@@ -66,10 +66,10 @@ typedef struct {
 } BUFFER_SET;
 
 static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
-  return mode >= NEARESTMV && mode <= NEWMV;
+  return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
 }
 static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
-  return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+  return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
 }
 
 static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
@@ -148,10 +148,6 @@ static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
           mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
 }
 
-static INLINE int use_masked_motion_search(COMPOUND_TYPE type) {
-  return (type == COMPOUND_WEDGE);
-}
-
 static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
   return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
 }
@@ -267,8 +263,8 @@ typedef struct MB_MODE_INFO {
   int mi_row;
   int mi_col;
 #endif
-  int num_proj_ref[2];
-  WarpedMotionParams wm_params[2];
+  int num_proj_ref;
+  WarpedMotionParams wm_params;
 
   // Index of the alpha Cb and alpha Cr combination
   int cfl_alpha_idx;
@@ -376,7 +372,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
 }
 #endif
 
-enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
 
 struct buf_2d {
   uint8_t *buf;
@@ -500,6 +496,8 @@ typedef struct jnt_comp_params {
   int bck_offset;
 } JNT_COMP_PARAMS;
 
+// Most/all of the pointers are mere pointers to actual arrays that are
+// allocated elsewhere. This is mostly for coding convenience.
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
@@ -544,7 +542,7 @@ typedef struct macroblockd {
   SgrprojInfo sgrproj_info[MAX_MB_PLANE];
 
   // block dimension in the unit of mode_info.
-  uint8_t n8_w, n8_h;
+  uint8_t n4_w, n4_h;
 
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
@@ -599,6 +597,9 @@ typedef struct macroblockd {
   uint16_t cb_offset[MAX_MB_PLANE];
   uint16_t txb_offset[MAX_MB_PLANE];
   uint16_t color_index_map_offset[2];
+
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
 } MACROBLOCKD;
 
 static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
@@ -623,6 +624,11 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
   }
 }
 
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
 // Note: the input block size should be square.
 // Otherwise it's considered invalid.
 static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
@@ -781,6 +787,8 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
   return intra_mode_to_tx_type(mbmi, plane_type);
 }
 
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
                                               int subsampling_x,
                                               int subsampling_y) {
@@ -952,15 +960,6 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size, void *arg);
 
-void av1_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
-    foreach_transformed_block_visitor visit, void *arg);
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   foreach_transformed_block_visitor visit,
-                                   void *arg, const int num_planes);
-
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                       int has_eob, int aoff, int loff);
@@ -976,7 +975,7 @@ static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
 }
 
 static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
-  return (mode >= NEARESTMV) && (mode <= NEWMV);
+  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
 }
 
 static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
@@ -1045,7 +1044,7 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
       is_motion_variation_allowed_compound(mbmi)) {
     if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
     assert(!has_second_ref(mbmi));
-    if (mbmi->num_proj_ref[0] >= 1 &&
+    if (mbmi->num_proj_ref >= 1 &&
         (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
       if (xd->cur_frame_force_integer_mv) {
         return OBMC_CAUSAL;
@@ -1174,4 +1173,4 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_BLOCKD_H_
+#endif  // AOM_AV1_COMMON_BLOCKD_H_
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
index 092230de9..3b2eac8a5 100644
--- a/third_party/aom/av1/common/cdef.h
+++ b/third_party/aom/av1/common/cdef.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_CDEF_H_
-#define AV1_COMMON_CDEF_H_
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
 
 #define CDEF_STRENGTH_BITS 6
 
@@ -48,4 +48,4 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // AV1_COMMON_CDEF_H_
+#endif  // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
index 81c6da077..6b4452cd6 100644
--- a/third_party/aom/av1/common/cdef_block.h
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#if !defined(_CDEF_BLOCK_H)
-#define _CDEF_BLOCK_H (1)
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
 
 #include "av1/common/odintrin.h"
 
@@ -56,4 +56,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
                     cdef_list *dlist, int cdef_count, int level,
                     int sec_strength, int pri_damping, int sec_damping,
                     int coeff_shift);
-#endif
+#endif  // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
index d24a7c0fa..14587a023 100644
--- a/third_party/aom/av1/common/cdef_block_simd.h
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
 #include "config/av1_rtcd.h"
 
 #include "av1/common/cdef_block.h"
@@ -913,3 +916,5 @@ void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
     }
   }
 }
+
+#endif  // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
index bc9fbce1b..d627891bf 100644
--- a/third_party/aom/av1/common/cfl.h
+++ b/third_party/aom/av1/common/cfl.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_CFL_H_
-#define AV1_COMMON_CFL_H_
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
 
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
@@ -299,4 +299,4 @@ void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
     return pred[tx_size % TX_SIZES_ALL];                                  \
   }
 
-#endif  // AV1_COMMON_CFL_H_
+#endif  // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
index 72c6d3a1e..bed6083db 100644
--- a/third_party/aom/av1/common/common.h
+++ b/third_party/aom/av1/common/common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_COMMON_H_
-#define AV1_COMMON_COMMON_H_
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
 
 /* Interface header for common constant data structures and lookup tables */
 
@@ -60,4 +60,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_COMMON_H_
+#endif  // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
index f521f10bf..46e455fdb 100644
--- a/third_party/aom/av1/common/common_data.h
+++ b/third_party/aom/av1/common/common_data.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_COMMON_DATA_H_
-#define AV1_COMMON_COMMON_DATA_H_
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
 
 #include "av1/common/enums.h"
 #include "aom/aom_integer.h"
@@ -20,34 +20,43 @@
 extern "C" {
 #endif
 
-// Log 2 conversion lookup tables in units of mode info(4x4).
+// Log 2 conversion lookup tables in units of mode info (4x4).
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
   0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
 };
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
   0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
 };
 
+// Width/height lookup tables in units of mode info (4x4).
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
   1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
 };
 
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
   1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
 };
 
-// Width/height lookup tables in units of various block sizes
+// Width/height lookup tables in units of samples.
+// The Block_Width table in the spec (Section 9.3. Conversion tables).
 static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
   4,  4,  8,  8,   8,   16, 16, 16, 32, 32, 32,
   64, 64, 64, 128, 128, 4,  16, 8,  32, 16, 64
 };
 
+// The Block_Height table in the spec (Section 9.3. Conversion tables).
 static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
   4,  8,  4,   8,  16,  8,  16, 32, 16, 32, 64,
   32, 64, 128, 64, 128, 16, 4,  32, 8,  64, 16
 };
 
-// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
+// Maps a block size to a context.
+// The Size_Group table in the spec (Section 9.3. Conversion tables).
+// AOMMIN(3, AOMMIN(mi_size_wide_log2[bsize], mi_size_high_log2[bsize]))
 static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
   0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
 };
@@ -56,6 +65,8 @@ static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
   4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
 };
 
+// A compressed version of the Partition_Subsize table in the spec (Section
+// 9.3. Conversion tables), for square block sizes only.
 /* clang-format off */
 static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
   {     // PARTITION_NONE
@@ -350,34 +361,36 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
   TX_64X64,  // TX_MODE_LARGEST
   TX_64X64,  // TX_MODE_SELECT
 };
-/* clang-format on */
 
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
 static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
-  //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
-  //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
-  { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
-  { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } },
-  { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } },
-  { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
-  { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } },
-  { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } },
-  { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
-  { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } },
-  { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } },
-  { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
-  { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
-  { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
-  { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
-  { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } },
-  { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } },
-  { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
-  { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
-  { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
-  { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+  //  ss_x == 0      ss_x == 0          ss_x == 1      ss_x == 1
+  //  ss_y == 0      ss_y == 1          ss_y == 0      ss_y == 1
+  { { BLOCK_4X4,     BLOCK_4X4 },     { BLOCK_4X4,     BLOCK_4X4 } },
+  { { BLOCK_4X8,     BLOCK_4X4 },     { BLOCK_INVALID, BLOCK_4X4 } },
+  { { BLOCK_8X4,     BLOCK_INVALID }, { BLOCK_4X4,     BLOCK_4X4 } },
+  { { BLOCK_8X8,     BLOCK_8X4 },     { BLOCK_4X8,     BLOCK_4X4 } },
+  { { BLOCK_8X16,    BLOCK_8X8 },     { BLOCK_INVALID, BLOCK_4X8 } },
+  { { BLOCK_16X8,    BLOCK_INVALID }, { BLOCK_8X8,     BLOCK_8X4 } },
+  { { BLOCK_16X16,   BLOCK_16X8 },    { BLOCK_8X16,    BLOCK_8X8 } },
+  { { BLOCK_16X32,   BLOCK_16X16 },   { BLOCK_INVALID, BLOCK_8X16 } },
+  { { BLOCK_32X16,   BLOCK_INVALID }, { BLOCK_16X16,   BLOCK_16X8 } },
+  { { BLOCK_32X32,   BLOCK_32X16 },   { BLOCK_16X32,   BLOCK_16X16 } },
+  { { BLOCK_32X64,   BLOCK_32X32 },   { BLOCK_INVALID, BLOCK_16X32 } },
+  { { BLOCK_64X32,   BLOCK_INVALID }, { BLOCK_32X32,   BLOCK_32X16 } },
+  { { BLOCK_64X64,   BLOCK_64X32 },   { BLOCK_32X64,   BLOCK_32X32 } },
+  { { BLOCK_64X128,  BLOCK_64X64 },   { BLOCK_INVALID, BLOCK_32X64 } },
+  { { BLOCK_128X64,  BLOCK_INVALID }, { BLOCK_64X64,   BLOCK_64X32 } },
+  { { BLOCK_128X128, BLOCK_128X64 },  { BLOCK_64X128,  BLOCK_64X64 } },
+  { { BLOCK_4X16,    BLOCK_4X8 },     { BLOCK_INVALID, BLOCK_4X8 } },
+  { { BLOCK_16X4,    BLOCK_INVALID }, { BLOCK_8X4,     BLOCK_8X4 } },
+  { { BLOCK_8X32,    BLOCK_8X16 },    { BLOCK_INVALID, BLOCK_4X16 } },
+  { { BLOCK_32X8,    BLOCK_INVALID }, { BLOCK_16X8,    BLOCK_16X4 } },
+  { { BLOCK_16X64,   BLOCK_16X32 },   { BLOCK_INVALID, BLOCK_8X32 } },
+  { { BLOCK_64X16,   BLOCK_INVALID }, { BLOCK_32X16,   BLOCK_32X8 } }
 };
+/* clang-format on */
 
 // Generates 5 bit field in which each bit set to 1 represents
 // a blocksize partition  11111 means we split 128x128, 64x64, 32x32, 16x16
@@ -430,4 +443,4 @@ static const int quant_dist_lookup_table[2][4][2] = {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_COMMON_DATA_H_
+#endif  // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index ed962c722..1f11126fc 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -173,6 +173,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -510,31 +511,73 @@ static void convolve_2d_scale_wrapper(
                         y_step_qn, conv_params);
 }
 
+// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
+// we may create optimized code to do 2-tap filtering for all bilinear filtering
+// usages, not just IntraBC.
+static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    int subpel_x_q4, int subpel_y_q4,
+                                    ConvolveParams *conv_params) {
+  const InterpFilterParams *filter_params_x =
+      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+  const InterpFilterParams *filter_params_y =
+      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, 0, 0, conv_params);
+  } else if (subpel_x_q4 != 0) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, 0, 0, conv_params);
+  } else {
+    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, 0, 0, conv_params);
+  }
+}
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             InterpFilters interp_filters, const int subpel_x_q4,
                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
                             int scaled, ConvolveParams *conv_params,
-                            const struct scale_factors *sf) {
+                            const struct scale_factors *sf, int is_intrabc) {
+  assert(IMPLIES(is_intrabc, !scaled));
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst;
   (void)dst_stride;
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
+                            subpel_y_q4, conv_params);
+    return;
+  }
+
+  InterpFilter filter_x = 0;
+  InterpFilter filter_y = 0;
+  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+  if (need_filter_params_x)
+    filter_x = av1_extract_interp_filter(interp_filters, 1);
+  if (need_filter_params_y)
+    filter_y = av1_extract_interp_filter(interp_filters, 0);
   const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size(filter_x, w);
+      need_filter_params_x
+          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+          : NULL;
   const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size(filter_y, h);
+      need_filter_params_y
+          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+          : NULL;
 
-  if (scaled)
+  if (scaled) {
     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_q4,
                               x_step_q4, subpel_y_q4, y_step_q4, conv_params);
-  else
+  } else {
     sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
         src, src_stride, dst, dst_stride, w, h, filter_params_x,
         filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+  }
 }
 
 void av1_highbd_convolve_2d_copy_sr_c(
@@ -964,24 +1007,68 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
   }
 }
 
+static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
+                                           uint16_t *dst, int dst_stride, int w,
+                                           int h, int subpel_x_q4,
+                                           int subpel_y_q4,
+                                           ConvolveParams *conv_params,
+                                           int bd) {
+  const InterpFilterParams *filter_params_x =
+      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+  const InterpFilterParams *filter_params_y =
+      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, 0, 0,
+                                conv_params, bd);
+  } else if (subpel_x_q4 != 0) {
+    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, filter_params_y, 0, 0,
+                               conv_params, bd);
+  } else {
+    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, filter_params_y, 0, 0,
+                               conv_params, bd);
+  }
+}
+
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
                                    InterpFilters interp_filters,
                                    const int subpel_x_q4, int x_step_q4,
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd) {
+                                   const struct scale_factors *sf,
+                                   int is_intrabc, int bd) {
+  assert(IMPLIES(is_intrabc, !scaled));
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst_stride;
-
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
+                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
+    return;
+  }
+
+  InterpFilter filter_x = 0;
+  InterpFilter filter_y = 0;
+  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+  if (need_filter_params_x)
+    filter_x = av1_extract_interp_filter(interp_filters, 1);
+  if (need_filter_params_y)
+    filter_y = av1_extract_interp_filter(interp_filters, 0);
   const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size(filter_x, w);
+      need_filter_params_x
+          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+          : NULL;
   const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size(filter_y, h);
+      need_filter_params_y
+          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+          : NULL;
 
   if (scaled) {
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -1111,7 +1198,8 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
 
   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
   const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
 
   assert(w <= MAX_SB_SIZE);
   assert(h <= MAX_SB_SIZE);
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index bc2d4bccf..4109dd843 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_AV1_CONVOLVE_H_
-#define AV1_COMMON_AV1_CONVOLVE_H_
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
 #include "av1/common/filter.h"
 
 #ifdef __cplusplus
@@ -19,7 +19,6 @@ extern "C" {
 
 typedef uint16_t CONV_BUF_TYPE;
 typedef struct ConvolveParams {
-  int ref;
   int do_average;
   CONV_BUF_TYPE *dst;
   int dst_stride;
@@ -59,15 +58,13 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             InterpFilters interp_filters, const int subpel_x_q4,
                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
                             int scaled, ConvolveParams *conv_params,
-                            const struct scale_factors *sf);
+                            const struct scale_factors *sf, int is_intrabc);
 
-static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
-                                                      int plane,
+static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
                                                       CONV_BUF_TYPE *dst,
                                                       int dst_stride,
                                                       int is_compound, int bd) {
   ConvolveParams conv_params;
-  conv_params.ref = ref;
   conv_params.do_average = do_average;
   assert(IMPLIES(do_average, is_compound));
   conv_params.is_compound = is_compound;
@@ -88,15 +85,14 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
   return conv_params;
 }
 
-static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
                                              int bd) {
-  return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+  return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
 }
 
 static INLINE ConvolveParams get_conv_params_wiener(int bd) {
   ConvolveParams conv_params;
   (void)bd;
-  conv_params.ref = 0;
   conv_params.do_average = 0;
   conv_params.is_compound = 0;
   conv_params.round_0 = WIENER_ROUND0_BITS;
@@ -119,10 +115,11 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    const int subpel_x_q4, int x_step_q4,
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd);
+                                   const struct scale_factors *sf,
+                                   int is_intrabc, int bd);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_AV1_CONVOLVE_H_
+#endif  // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
index ef944c5a0..991692c2f 100644
--- a/third_party/aom/av1/common/entropy.h
+++ b/third_party/aom/av1/common/entropy.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENTROPY_H_
-#define AV1_COMMON_ENTROPY_H_
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
 
 #include "config/aom_config.h"
 
@@ -178,4 +178,4 @@ static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENTROPY_H_
+#endif  // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
index 0bd2e20a1..7047f34d2 100644
--- a/third_party/aom/av1/common/entropymode.h
+++ b/third_party/aom/av1/common/entropymode.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENTROPYMODE_H_
-#define AV1_COMMON_ENTROPYMODE_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
 
 #include "av1/common/entropy.h"
 #include "av1/common/entropymv.h"
@@ -186,6 +186,8 @@ void av1_set_default_mode_deltas(int8_t *mode_deltas);
 void av1_setup_frame_contexts(struct AV1Common *cm);
 void av1_setup_past_independence(struct AV1Common *cm);
 
+// Returns (int)ceil(log2(n)); returns 0 for any n < 2 (including n <= 0).
+// NOTE: This implementation only works for n <= 2^30.
 static INLINE int av1_ceil_log2(int n) {
   if (n < 2) return 0;
   int i = 1, p = 2;
@@ -207,4 +209,4 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENTROPYMODE_H_
+#endif  // AOM_AV1_COMMON_ENTROPYMODE_H_
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
index 446aa433c..491337387 100644
--- a/third_party/aom/av1/common/entropymv.c
+++ b/third_party/aom/av1/common/entropymv.c
@@ -60,61 +60,6 @@ static const nmv_context default_nmv_context = {
     } },
 };
 
-static const uint8_t log_in_base_2[] = {
-  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-  4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
-};
-
-static INLINE int mv_class_base(MV_CLASS_TYPE c) {
-  return c ? CLASS0_SIZE << (c + 2) : 0;
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
-  const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
-                              ? MV_CLASS_10
-                              : (MV_CLASS_TYPE)log_in_base_2[z >> 3];
-  if (offset) *offset = z - mv_class_base(c);
-  return c;
-}
-
 void av1_init_mv_probs(AV1_COMMON *cm) {
   // NB: this sets CDFs too
   cm->fc->nmvc = default_nmv_context;
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
index 02ca7b66b..fa818a2c1 100644
--- a/third_party/aom/av1/common/entropymv.h
+++ b/third_party/aom/av1/common/entropymv.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENTROPYMV_H_
-#define AV1_COMMON_ENTROPYMV_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
 
 #include "config/aom_config.h"
 
@@ -91,16 +91,6 @@ typedef struct {
   nmv_component comps[2];
 } nmv_context;
 
-static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
-  if (mv->row == 0) {
-    return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
-  } else {
-    return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
-  }
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset);
-
 typedef enum {
   MV_SUBPEL_NONE = -1,
   MV_SUBPEL_LOW_PRECISION = 0,
@@ -111,4 +101,4 @@ typedef enum {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENTROPYMV_H_
+#endif  // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index 689c25f30..869c06ef2 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENUMS_H_
-#define AV1_COMMON_ENUMS_H_
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
 
 #include "config/aom_config.h"
 
@@ -274,7 +274,7 @@ typedef enum ATTRIBUTE_PACKED {
   TX_TYPES,
 } TX_TYPE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   REG_REG,
   REG_SMOOTH,
   REG_SHARP,
@@ -438,6 +438,8 @@ typedef enum ATTRIBUTE_PACKED {
   COMP_INTER_MODE_START = NEAREST_NEARESTMV,
   COMP_INTER_MODE_END = MB_MODE_COUNT,
   COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+  INTER_MODE_START = NEARESTMV,
+  INTER_MODE_END = MB_MODE_COUNT,
   INTRA_MODES = PAETH_PRED + 1,  // PAETH_PRED has to be the last intra mode.
   INTRA_INVALID = MB_MODE_COUNT  // For uv_mode in inter blocks
 } PREDICTION_MODE;
@@ -478,7 +480,7 @@ typedef enum ATTRIBUTE_PACKED {
   INTERINTRA_MODES
 } INTERINTRA_MODE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   COMPOUND_AVERAGE,
   COMPOUND_WEDGE,
   COMPOUND_DIFFWTD,
@@ -614,4 +616,4 @@ typedef enum ATTRIBUTE_PACKED {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENUMS_H_
+#endif  // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 7f8ad583a..571422d11 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_FILTER_H_
-#define AV1_COMMON_FILTER_H_
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
 
 #include <assert.h>
 
@@ -139,6 +139,17 @@ static const InterpFilterParams
         BILINEAR }
     };
 
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full-pixel
+// MVs for luma; when chroma is subsampled, the chroma MV may be half-pel.
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
+  64,
+  64,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+  av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+};
+
 DECLARE_ALIGNED(256, static const InterpKernel,
                 av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
@@ -181,6 +192,11 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
   return &av1_interp_filter_params_list[interp_filter];
 }
 
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+    const InterpFilter interp_filter) {
+  return &av1_interp_4tap[interp_filter];
+}
+
 static INLINE const int16_t *av1_get_interp_filter_kernel(
     const InterpFilter interp_filter) {
   return av1_interp_filter_params_list[interp_filter].filter_ptr;
@@ -195,4 +211,4 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_FILTER_H_
+#endif  // AOM_AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
index 502ccd27d..fd6c4bc79 100644
--- a/third_party/aom/av1/common/frame_buffers.c
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -38,6 +38,17 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
   list->int_fb = NULL;
 }
 
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    if (list->int_fb[i].data && !list->int_fb[i].in_use)
+      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+  }
+}
+
 int av1_get_frame_buffer(void *cb_priv, size_t min_size,
                          aom_codec_frame_buffer_t *fb) {
   int i;
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
index e7341cfdd..16188e51c 100644
--- a/third_party/aom/av1/common/frame_buffers.h
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_FRAME_BUFFERS_H_
-#define AV1_COMMON_FRAME_BUFFERS_H_
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
 
 #include "aom/aom_frame_buffer.h"
 #include "aom/aom_integer.h"
@@ -36,6 +36,12 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
 // Free any data allocated to the frame buffers.
 void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
 
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
 // Callback used by libaom to request an external frame buffer. |cb_priv|
 // Callback private data, which points to an InternalFrameBufferList.
 // |min_size| is the minimum size in bytes needed to decode the next frame.
@@ -51,4 +57,4 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_FRAME_BUFFERS_H_
+#endif  // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
index bc758eb57..2c1cb9827 100644
--- a/third_party/aom/av1/common/idct.c
+++ b/third_party/aom/av1/common/idct.c
@@ -31,21 +31,16 @@ int av1_get_tx_scale(const TX_SIZE tx_size) {
 // that input and output could be the same buffer.
 
 // idct
-static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
-                               int stride, int eob, int bd) {
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
   if (eob > 1)
     av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
   else
     av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
-static const int32_t *cast_to_int32(const tran_low_t *input) {
-  assert(sizeof(int32_t) == sizeof(tran_low_t));
-  return (const int32_t *)input;
-}
-
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   int eob = txfm_param->eob;
   int bd = txfm_param->bd;
@@ -54,206 +49,150 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
   const TX_TYPE tx_type = txfm_param->tx_type;
   if (lossless) {
     assert(tx_type == DCT_DCT);
-    highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
     return;
   }
-  switch (tx_type) {
-    // Assembly version doesn't support some transform types, so use C version
-    // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                             bd);
-      break;
-  }
+
+  av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
 }
 
-static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                 int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride,
-                         txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                 int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride,
-                         txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
-}
 
-static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
-  const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
 }
 
-static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
   int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    // Assembly version doesn't support some transform types, so use C version
-    // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+
+  av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                              bd);
-      break;
-  }
 }
 
-static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    // Assembly version doesn't support some transform types, so use C version
-    // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-  }
+  av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    // Assembly version doesn't support IDTX, so use C version for it.
-    case IDTX:
-      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
 
-    default: assert(0);
-  }
+  av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
 }
 
-static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
   assert(tx_type == DCT_DCT);
-  av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
+  av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
 }
 
 static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
@@ -270,70 +209,70 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
       txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
 }
 
-static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
-                                int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+                               int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
     case TX_32X32:
-      highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
       break;
     case TX_16X16:
-      highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
       break;
     case TX_8X8:
-      highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
       break;
     case TX_4X8:
-      highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
-      highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
       break;
     case TX_8X16:
-      highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
       break;
     case TX_16X8:
-      highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
       break;
     case TX_16X32:
-      highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
       break;
     case TX_32X16:
-      highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
       break;
     case TX_64X64:
-      highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
       break;
     case TX_32X64:
-      highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
       break;
     case TX_64X32:
-      highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
       break;
     case TX_16X64:
-      highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
       break;
     case TX_64X16:
-      highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
       break;
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
       break;
     case TX_16X4:
-      highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
       break;
     case TX_4X16:
-      highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
       break;
     case TX_8X32:
-      highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
       break;
     case TX_32X8:
-      highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
       break;
     default: assert(0 && "Invalid transform size"); break;
   }
@@ -352,7 +291,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
     }
   }
 
-  highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param);
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          txfm_param);
 
   for (int r = 0; r < h; ++r) {
     for (int c = 0; c < w; ++c) {
@@ -375,7 +315,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
   assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
 
   if (txfm_param.is_hbd) {
-    highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
   } else {
     av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
   }
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
index 50032a167..d9454e73f 100644
--- a/third_party/aom/av1/common/idct.h
+++ b/third_party/aom/av1/common/idct.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_IDCT_H_
-#define AV1_COMMON_IDCT_H_
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
 
 #include "config/aom_config.h"
 
@@ -36,11 +36,32 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
                                  const tran_low_t *dqcoeff, int plane,
                                  TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
                                  int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+  assert(sizeof(int32_t) == sizeof(tran_low_t));
+  return (const int32_t *)input;
+}
+
+typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *param);
+
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
 
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *param);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_IDCT_H_
+#endif  // AOM_AV1_COMMON_IDCT_H_
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index c2495640e..5b0225192 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_MV_H_
-#define AV1_COMMON_MV_H_
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
 
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
@@ -56,7 +56,7 @@ typedef struct mv32 {
 #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
 
 /* clang-format off */
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   IDENTITY = 0,      // identity transformation, 0-parameter
   TRANSLATION = 1,   // translational motion 2-parameter
   ROTZOOM = 2,       // simplified affine with rotation + zoom only, 4-parameter
@@ -298,4 +298,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_MV_H_
+#endif  // AOM_AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
index 6939df335..7f24ab4e6 100644
--- a/third_party/aom/av1/common/mvref_common.c
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -27,16 +27,19 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) {
   den = AOMMIN(den, MAX_FRAME_DISTANCE);
   num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
                 : AOMMAX(num, -MAX_FRAME_DISTANCE);
-  int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
-  int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+  const int mv_row =
+      ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+  const int mv_col =
+      ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
   const int clamp_max = MV_UPP - 1;
   const int clamp_min = MV_LOW + 1;
   output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
   output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
 }
 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
-                        int mi_row, int mi_col, int x_mis, int y_mis) {
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis) {
   const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
   MV_REF *frame_mvs =
       cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
@@ -141,38 +144,37 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                           uint8_t *ref_match_count, uint8_t *newmv_count,
                           int_mv *gm_mv_candidates, int max_row_offset,
                           int *processed_rows) {
-  int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+  int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
   end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
   const int n8_w_8 = mi_size_wide[BLOCK_8X8];
   const int n8_w_16 = mi_size_wide[BLOCK_16X16];
   int i;
   int col_offset = 0;
-  const int shift = 0;
   // TODO(jingning): Revisit this part after cb4x4 is stable.
   if (abs(row_offset) > 1) {
     col_offset = 1;
-    if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset;
+    if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
   }
-  const int use_step_16 = (xd->n8_w >= 16);
+  const int use_step_16 = (xd->n4_w >= 16);
   MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
   (void)mi_row;
 
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
     const int candidate_bsize = candidate->sb_type;
-    const int n8_w = mi_size_wide[candidate_bsize];
-    int len = AOMMIN(xd->n8_w, n8_w);
+    const int n4_w = mi_size_wide[candidate_bsize];
+    int len = AOMMIN(xd->n4_w, n4_w);
     if (use_step_16)
       len = AOMMAX(n8_w_16, len);
     else if (abs(row_offset) > 1)
       len = AOMMAX(len, n8_w_8);
 
     int weight = 2;
-    if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) {
+    if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
       int inc = AOMMIN(-max_row_offset + row_offset + 1,
                        mi_size_high[candidate_bsize]);
       // Obtain range used in weight calculation.
-      weight = AOMMAX(weight, (inc << shift));
+      weight = AOMMAX(weight, inc);
       // Update processed rows.
       *processed_rows = inc - row_offset - 1;
     }
@@ -192,37 +194,36 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                           uint8_t *ref_match_count, uint8_t *newmv_count,
                           int_mv *gm_mv_candidates, int max_col_offset,
                           int *processed_cols) {
-  int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+  int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
   end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
   const int n8_h_8 = mi_size_high[BLOCK_8X8];
   const int n8_h_16 = mi_size_high[BLOCK_16X16];
   int i;
   int row_offset = 0;
-  const int shift = 0;
   if (abs(col_offset) > 1) {
     row_offset = 1;
-    if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset;
+    if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
   }
-  const int use_step_16 = (xd->n8_h >= 16);
+  const int use_step_16 = (xd->n4_h >= 16);
   (void)mi_col;
 
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
     const int candidate_bsize = candidate->sb_type;
-    const int n8_h = mi_size_high[candidate_bsize];
-    int len = AOMMIN(xd->n8_h, n8_h);
+    const int n4_h = mi_size_high[candidate_bsize];
+    int len = AOMMIN(xd->n4_h, n4_h);
     if (use_step_16)
       len = AOMMAX(n8_h_16, len);
     else if (abs(col_offset) > 1)
       len = AOMMAX(len, n8_h_8);
 
     int weight = 2;
-    if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) {
+    if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
       int inc = AOMMIN(-max_col_offset + col_offset + 1,
                        mi_size_wide[candidate_bsize]);
       // Obtain range used in weight calculation.
-      weight = AOMMAX(weight, (inc << shift));
+      weight = AOMMAX(weight, inc);
       // Update processed cols.
       *processed_cols = inc - col_offset - 1;
     }
@@ -248,7 +249,7 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   mi_pos.row = row_offset;
   mi_pos.col = col_offset;
 
-  if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+  if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
     const MB_MODE_INFO *const candidate =
         xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
     const int len = mi_size_wide[BLOCK_8X8];
@@ -290,19 +291,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 
   // The left hand of two vertical rectangles always has a top right (as the
   // block above will have been decoded)
-  if (xd->n8_w < xd->n8_h)
+  if (xd->n4_w < xd->n4_h)
     if (!xd->is_sec_rect) has_tr = 1;
 
   // The bottom of two horizontal rectangles never has a top right (as the block
   // to the right won't have been decoded)
-  if (xd->n8_w > xd->n8_h)
+  if (xd->n4_w > xd->n4_h)
     if (xd->is_sec_rect) has_tr = 0;
 
   // The bottom left square of a Vertical A (in the old format) does
   // not have a top right as it is decoded before the right hand
   // rectangle of the partition
   if (xd->mi[0]->partition == PARTITION_VERT_A) {
-    if (xd->n8_w == xd->n8_h)
+    if (xd->n4_w == xd->n4_h)
       if (mask_row & bs) has_tr = 0;
   }
 
@@ -335,7 +336,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
   mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
 
-  if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0;
+  if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
 
   const TPL_MV_REF *prev_frame_mvs =
       cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
@@ -430,20 +431,75 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   return 0;
 }
 
+static void process_compound_ref_mv_candidate(
+    const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+    const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+    int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+    MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+    for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+      if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+        ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+        ++ref_id_count[cmp_idx];
+      } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+        int_mv this_mv = candidate->mv[rf_idx];
+        if (cm->ref_frame_sign_bias[can_rf] !=
+            cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+          this_mv.as_mv.row = -this_mv.as_mv.row;
+          this_mv.as_mv.col = -this_mv.as_mv.col;
+        }
+        ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+        ++ref_diff_count[cmp_idx];
+      }
+    }
+  }
+}
+
+static void process_single_ref_mv_candidate(
+    const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+    MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+    CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+    if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+      int_mv this_mv = candidate->mv[rf_idx];
+      if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+          cm->ref_frame_sign_bias[ref_frame]) {
+        this_mv.as_mv.row = -this_mv.as_mv.row;
+        this_mv.as_mv.col = -this_mv.as_mv.col;
+      }
+      int stack_idx;
+      for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+        const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+        if (this_mv.as_int == stack_mv.as_int) break;
+      }
+
+      if (stack_idx == refmv_count[ref_frame]) {
+        ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+        // TODO(jingning): Set an arbitrary small number here. The weight
+        // doesn't matter as long as it is properly initialized.
+        ref_mv_stack[ref_frame][stack_idx].weight = 2;
+        ++refmv_count[ref_frame];
+      }
+    }
+  }
+}
+
 static void setup_ref_mv_list(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
     uint8_t refmv_count[MODE_CTX_REF_FRAMES],
     CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
     int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
     int mi_row, int mi_col, int16_t *mode_context) {
-  const int bs = AOMMAX(xd->n8_w, xd->n8_h);
+  const int bs = AOMMAX(xd->n4_w, xd->n4_h);
   const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
   MV_REFERENCE_FRAME rf[2];
 
   const TileInfo *const tile = &xd->tile;
   int max_row_offset = 0, max_col_offset = 0;
-  const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
-  const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+  const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+  const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
   int processed_rows = 0;
   int processed_cols = 0;
 
@@ -455,17 +511,16 @@ static void setup_ref_mv_list(
   if (xd->up_available) {
     max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
 
-    if (xd->n8_h < mi_size_high[BLOCK_8X8])
+    if (xd->n4_h < mi_size_high[BLOCK_8X8])
       max_row_offset = -(2 << 1) + row_adj;
 
-    max_row_offset =
-        find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset);
+    max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
   }
 
   if (xd->left_available) {
     max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
 
-    if (xd->n8_w < mi_size_wide[BLOCK_8X8])
+    if (xd->n4_w < mi_size_wide[BLOCK_8X8])
       max_col_offset = -(2 << 1) + col_adj;
 
     max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
@@ -487,12 +542,12 @@ static void setup_ref_mv_list(
                   gm_mv_candidates, max_col_offset, &processed_cols);
   // Check top-right boundary
   if (has_tr)
-    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w,
+    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
                   ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
                   gm_mv_candidates, &refmv_count[ref_frame]);
 
-  uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
-  uint8_t nearest_refmv_count = refmv_count[ref_frame];
+  const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+  const uint8_t nearest_refmv_count = refmv_count[ref_frame];
 
   // TODO(yunqing): for comp_search, do it for all 3 cases.
   for (int idx = 0; idx < nearest_refmv_count; ++idx)
@@ -500,27 +555,27 @@ static void setup_ref_mv_list(
 
   if (cm->allow_ref_frame_mvs) {
     int is_available = 0;
-    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
-    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
-    const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]);
-    const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]);
+    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
+    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
+    const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
+    const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
 
     const int tpl_sample_pos[3][2] = {
       { voffset, -2 },
       { voffset, hoffset },
       { voffset - 2, hoffset },
     };
-    const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) &&
-                                (xd->n8_h < mi_size_high[BLOCK_64X64]) &&
-                                (xd->n8_w >= mi_size_wide[BLOCK_8X8]) &&
-                                (xd->n8_w < mi_size_wide[BLOCK_64X64]);
-
-    int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64])
-                     ? mi_size_high[BLOCK_16X16]
-                     : mi_size_high[BLOCK_8X8];
-    int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64])
-                     ? mi_size_wide[BLOCK_16X16]
-                     : mi_size_wide[BLOCK_8X8];
+    const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
+                                (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
+                                (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
+                                (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+
+    const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+                           ? mi_size_high[BLOCK_16X16]
+                           : mi_size_high[BLOCK_8X8];
+    const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+                           ? mi_size_wide[BLOCK_16X16]
+                           : mi_size_wide[BLOCK_8X8];
 
     for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
       for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
@@ -569,7 +624,7 @@ static void setup_ref_mv_list(
                     max_col_offset, &processed_cols);
   }
 
-  uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+  const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
 
   switch (nearest_match) {
     case 0:
@@ -636,62 +691,24 @@ static void setup_ref_mv_list(
       int_mv ref_id[2][2], ref_diff[2][2];
       int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
 
-      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
       mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
       mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
       int mi_size = AOMMIN(mi_width, mi_height);
 
       for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
-        const int candidate_bsize = candidate->sb_type;
-
-        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
-          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
-            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
-              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
-              ++ref_id_count[cmp_idx];
-            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
-              int_mv this_mv = candidate->mv[rf_idx];
-              if (cm->ref_frame_sign_bias[can_rf] !=
-                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
-                this_mv.as_mv.row = -this_mv.as_mv.row;
-                this_mv.as_mv.col = -this_mv.as_mv.col;
-              }
-              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
-              ++ref_diff_count[cmp_idx];
-            }
-          }
-        }
-        idx += mi_size_wide[candidate_bsize];
+        process_compound_ref_mv_candidate(
+            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+        idx += mi_size_wide[candidate->sb_type];
       }
 
       for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
-        const int candidate_bsize = candidate->sb_type;
-
-        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
-          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
-            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
-              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
-              ++ref_id_count[cmp_idx];
-            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
-              int_mv this_mv = candidate->mv[rf_idx];
-              if (cm->ref_frame_sign_bias[can_rf] !=
-                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
-                this_mv.as_mv.row = -this_mv.as_mv.row;
-                this_mv.as_mv.col = -this_mv.as_mv.col;
-              }
-              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
-              ++ref_diff_count[cmp_idx];
-            }
-          }
-        }
-        idx += mi_size_high[candidate_bsize];
+        process_compound_ref_mv_candidate(
+            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+        idx += mi_size_high[candidate->sb_type];
       }
 
       // Build up the compound mv predictor
@@ -743,87 +760,37 @@ static void setup_ref_mv_list(
 
     for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
       clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
       clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
     }
   } else {
     // Handle single reference frame extension
-    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
     mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
     mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
     int mi_size = AOMMIN(mi_width, mi_height);
 
     for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
                       refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
       const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
-      const int candidate_bsize = candidate->sb_type;
-
-      // TODO(jingning): Refactor the following code.
-      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
-          int_mv this_mv = candidate->mv[rf_idx];
-          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
-              cm->ref_frame_sign_bias[ref_frame]) {
-            this_mv.as_mv.row = -this_mv.as_mv.row;
-            this_mv.as_mv.col = -this_mv.as_mv.col;
-          }
-          int stack_idx;
-          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
-            if (this_mv.as_int == stack_mv.as_int) break;
-          }
-
-          if (stack_idx == refmv_count[ref_frame]) {
-            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
-            // TODO(jingning): Set an arbitrary small number here. The weight
-            // doesn't matter as long as it is properly initialized.
-            ref_mv_stack[ref_frame][stack_idx].weight = 2;
-            ++refmv_count[ref_frame];
-          }
-        }
-      }
-      idx += mi_size_wide[candidate_bsize];
+      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+                                      ref_mv_stack);
+      idx += mi_size_wide[candidate->sb_type];
     }
 
     for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
                       refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
       const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
-      const int candidate_bsize = candidate->sb_type;
-
-      // TODO(jingning): Refactor the following code.
-      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
-          int_mv this_mv = candidate->mv[rf_idx];
-          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
-              cm->ref_frame_sign_bias[ref_frame]) {
-            this_mv.as_mv.row = -this_mv.as_mv.row;
-            this_mv.as_mv.col = -this_mv.as_mv.col;
-          }
-          int stack_idx;
-          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
-            if (this_mv.as_int == stack_mv.as_int) break;
-          }
-
-          if (stack_idx == refmv_count[ref_frame]) {
-            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
-            // TODO(jingning): Set an arbitrary small number here. The weight
-            // doesn't matter as long as it is properly initialized.
-            ref_mv_stack[ref_frame][stack_idx].weight = 2;
-            ++refmv_count[ref_frame];
-          }
-        }
-      }
-      idx += mi_size_high[candidate_bsize];
+      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+                                      ref_mv_stack);
+      idx += mi_size_high[candidate->sb_type];
     }
 
     for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
       clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
     }
 
     if (mv_ref_list != NULL) {
@@ -936,8 +903,10 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
   const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
                                        : -((-mv.col) >> (4 + MI_SIZE_LOG2));
 
-  int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
-  int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+  const int row =
+      (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+  const int col =
+      (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
 
   if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
       col >= (cm->mi_cols >> 1))
@@ -955,37 +924,44 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
   return 1;
 }
 
-static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
-                                   int dir) {
+// Note: motion_field_projection finds the motion vectors of one of the
+// current frame's reference frames and projects them onto the current frame.
+// For clarity, that reference frame of the current frame is called the
+// "start frame", the start frame's own reference frames are called the
+// "reference frames", and ref_offset holds the frame distances between the
+// start frame and its reference frames.
+static int motion_field_projection(AV1_COMMON *cm,
+                                   MV_REFERENCE_FRAME start_frame, int dir) {
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
   int ref_offset[REF_FRAMES] = { 0 };
 
   (void)dir;
 
-  int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx;
-  if (ref_frame_idx < 0) return 0;
+  const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
+  if (start_frame_idx < 0) return 0;
 
-  if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0;
+  if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
 
-  if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows ||
-      cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols)
+  if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
+      cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
     return 0;
 
-  int ref_frame_index =
-      cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset;
-  unsigned int *ref_rf_idx =
-      &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0];
-  int cur_frame_index = cm->cur_frame->cur_frame_offset;
-  int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index);
+  const int start_frame_offset =
+      cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
+  const unsigned int *const ref_frame_offsets =
+      &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
+  const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
+  int start_to_current_frame_offset =
+      get_relative_dist(cm, start_frame_offset, cur_frame_offset);
 
   for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
-    ref_offset[rf] =
-        get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]);
+    ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
+                                       ref_frame_offsets[rf - LAST_FRAME]);
   }
 
-  if (dir == 2) ref_to_cur = -ref_to_cur;
+  if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
 
-  MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs;
+  MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
   const int mvs_rows = (cm->mi_rows + 1) >> 1;
   const int mvs_cols = (cm->mi_cols + 1) >> 1;
 
@@ -999,19 +975,20 @@ static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
         int mi_r, mi_c;
         const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
 
-        int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
-                        ref_frame_offset > 0 &&
-                        abs(ref_to_cur) <= MAX_FRAME_DISTANCE;
+        int pos_valid =
+            abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+            ref_frame_offset > 0 &&
+            abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
 
         if (pos_valid) {
-          get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur,
-                            ref_frame_offset);
+          get_mv_projection(&this_mv.as_mv, fwd_mv,
+                            start_to_current_frame_offset, ref_frame_offset);
           pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
                                          this_mv.as_mv, dir >> 1);
         }
 
         if (pos_valid) {
-          int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+          const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
 
           tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
           tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
@@ -1167,14 +1144,14 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
   if (up_available) {
     int mi_row_offset = -1;
     MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
-    uint8_t n8_w = mi_size_wide[mbmi->sb_type];
+    uint8_t n4_w = mi_size_wide[mbmi->sb_type];
 
-    if (xd->n8_w <= n8_w) {
+    if (xd->n4_w <= n4_w) {
       // Handle "current block width <= above block width" case.
-      int col_offset = -mi_col % n8_w;
+      int col_offset = -mi_col % n4_w;
 
       if (col_offset < 0) do_tl = 0;
-      if (col_offset + n8_w > xd->n8_w) do_tr = 0;
+      if (col_offset + n4_w > xd->n4_w) do_tr = 0;
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
         record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
@@ -1185,11 +1162,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
       }
     } else {
       // Handle "current block width > above block width" case.
-      for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+      for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
         int mi_col_offset = i;
         mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        n8_w = mi_size_wide[mbmi->sb_type];
-        mi_step = AOMMIN(xd->n8_w, n8_w);
+        n4_w = mi_size_wide[mbmi->sb_type];
+        mi_step = AOMMIN(xd->n4_w, n4_w);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1209,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
     int mi_col_offset = -1;
 
     MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
-    uint8_t n8_h = mi_size_high[mbmi->sb_type];
+    uint8_t n4_h = mi_size_high[mbmi->sb_type];
 
-    if (xd->n8_h <= n8_h) {
+    if (xd->n4_h <= n4_h) {
       // Handle "current block height <= above block height" case.
-      int row_offset = -mi_row % n8_h;
+      int row_offset = -mi_row % n4_h;
 
       if (row_offset < 0) do_tl = 0;
 
@@ -1226,11 +1203,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
       }
     } else {
       // Handle "current block height > above block height" case.
-      for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+      for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
         int mi_row_offset = i;
         mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        n8_h = mi_size_high[mbmi->sb_type];
-        mi_step = AOMMIN(xd->n8_h, n8_h);
+        n4_h = mi_size_high[mbmi->sb_type];
+        mi_step = AOMMIN(xd->n4_h, n4_h);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1264,18 +1241,18 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
 
   // Top-right block
   if (do_tr &&
-      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) {
-    POSITION trb_pos = { -1, xd->n8_w };
+      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
+    POSITION trb_pos = { -1, xd->n4_w };
 
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) {
+    if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
       int mi_row_offset = -1;
-      int mi_col_offset = xd->n8_w;
+      int mi_col_offset = xd->n4_w;
 
       MB_MODE_INFO *mbmi =
           xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1);
+        record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
         np++;
         if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
@@ -1372,7 +1349,7 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
 
 static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
                                REF_FRAME_INFO *ref_info) {
-  assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME);
+  assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
 
   const int buf_idx = ref_info->buf_idx;
 
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index f68c159e1..83f7a1ac0 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_MVREF_COMMON_H_
-#define AV1_COMMON_MVREF_COMMON_H_
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
 
 #include "av1/common/onyxc_int.h"
 #include "av1/common/blockd.h"
@@ -85,29 +85,17 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
 // Checks that the given mi_row, mi_col and search point
 // are inside the borders of the tile.
 static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
-                            int mi_rows, const POSITION *mi_pos) {
-  const int dependent_horz_tile_flag = 0;
-  if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
-    return !(mi_row + mi_pos->row < 0 ||
-             mi_col + mi_pos->col < tile->mi_col_start ||
-             mi_row + mi_pos->row >= mi_rows ||
-             mi_col + mi_pos->col >= tile->mi_col_end);
-  } else {
-    return !(mi_row + mi_pos->row < tile->mi_row_start ||
-             mi_col + mi_pos->col < tile->mi_col_start ||
-             mi_row + mi_pos->row >= tile->mi_row_end ||
-             mi_col + mi_pos->col >= tile->mi_col_end);
-  }
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < tile->mi_row_start ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= tile->mi_row_end ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
 }
 
 static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
-                                        int mi_rows, int row_offset) {
-  const int dependent_horz_tile_flag = 0;
-  if (dependent_horz_tile_flag && !tile->tg_horz_boundary)
-    return clamp(row_offset, -mi_row, mi_rows - mi_row - 1);
-  else
-    return clamp(row_offset, tile->mi_row_start - mi_row,
-                 tile->mi_row_end - mi_row - 1);
+                                        int row_offset) {
+  return clamp(row_offset, tile->mi_row_start - mi_row,
+               tile->mi_row_end - mi_row - 1);
 }
 
 static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
@@ -263,8 +251,9 @@ static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
   }
 }
 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
-                        int mi_row, int mi_col, int x_mis, int y_mis);
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis);
 
 void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                       MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
@@ -286,7 +275,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
 
 #define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
 #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
-#define USE_WAVE_FRONT 1  // Use only top left area of frame for reference.
 
 static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
                                    int mib_size, int mi_row, int mi_col) {
@@ -356,13 +344,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
   const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
   if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
 
-#if USE_WAVE_FRONT
+  // Wavefront constraint: use only top left area of frame for reference.
   const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
   const int wf_offset = gradient * (active_sb_row - src_sb_row);
   if (src_sb_row > active_sb_row ||
       src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
     return 0;
-#endif
 
   return 1;
 }
@@ -371,4 +358,4 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_MVREF_COMMON_H_
+#endif  // AOM_AV1_COMMON_MVREF_COMMON_H_
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
index 3918c82c6..1c90cd93f 100644
--- a/third_party/aom/av1/common/obmc.h
+++ b/third_party/aom/av1/common/obmc.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_OBMC_H_
-#define AV1_COMMON_OBMC_H_
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
 
 typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
                                           uint8_t nb_mi_size,
@@ -30,7 +30,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
   // prev_row_mi points into the mi array, starting at the beginning of the
   // previous row.
   MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
-  const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols);
+  const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
   uint8_t mi_step;
   for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
        above_mi_col += mi_step) {
@@ -49,7 +49,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
     }
     if (is_neighbor_overlappable(*above_mi)) {
       ++nb_count;
-      fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi,
+      fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
           fun_ctxt, num_planes);
     }
   }
@@ -68,7 +68,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
   // prev_col_mi points into the mi array, starting at the top of the
   // previous column
   MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
-  const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows);
+  const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
   uint8_t mi_step;
   for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
        left_mi_row += mi_step) {
@@ -82,10 +82,10 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
     }
     if (is_neighbor_overlappable(*left_mi)) {
       ++nb_count;
-      fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi,
+      fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
           fun_ctxt, num_planes);
     }
   }
 }
 
-#endif  // AV1_COMMON_OBMC_H_
+#endif  // AOM_AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 000000000..823b700b1
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when obu_type is one of the OBU types listed below, 0 otherwise.
+static int valid_obu_type(int obu_type) {
+  int valid_type = 0;  // Stays 0 unless one of the case labels matches.
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: valid_type = 1; break;  // Shared by every label above.
+    default: break;  // Any other value is left as invalid.
+  }
+  return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t u_obu_size = 0;  // 64-bit out-argument of aom_uleb_decode() below.
+  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+      0) {
+    return AOM_CODEC_CORRUPT_FRAME;  // Non-zero decode result means failure.
+  }
+  // Sizes above UINT32_MAX are rejected before the narrowing cast to size_t.
+  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+  *obu_size = (size_t)u_obu_size;  // Only written on the success path.
+  return AOM_CODEC_OK;
+}
+
+// Parses the OBU header bits from rb and stores the fields in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;  // NULL arguments.
+
+  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;  // No data.
+
+  header->size = 1;  // Grows to 2 if an extension byte follows.
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // Forbidden bit. Must not be set.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);  // 4-bit type field.
+
+  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  if (!header->has_size_field && !is_annexb) {
+    // Section 5 OBU streams must have the obu_size field set.
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  }
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // obu_reserved_1bit must be set to 0.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (header->has_extension) {
+    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+    // The extension header is one more byte: 3 + 2 + 3 bits are read below.
+    header->size += 1;
+    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+    if (aom_rb_read_literal(rb, 3) != 0) {
+      // extension_header_reserved_3bits must be set to 0.
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+  // NOTE(review): buffer itself is not NULL-checked before it is wrapped.
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
+  struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+                                    NULL };
+  aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+  if (parse_result == AOM_CODEC_OK) *consumed = header->size;  // Success only.
+  return parse_result;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read) {
+  size_t length_field_size = 0, obu_size = 0;  // obu_size: Annex B path only.
+  aom_codec_err_t status;
+  // Annex B: <size><header><payload>. Otherwise: <header><size><payload>.
+  if (is_annexb) {
+    // The size field precedes the OBU header and counts the header bytes too.
+    status =
+        read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+    if (status != AOM_CODEC_OK) return status;
+  }
+  // The header starts after the Annex B size field (offset 0 otherwise).
+  struct aom_read_bit_buffer rb = { data + length_field_size,
+                                    data + bytes_available, 0, NULL, NULL };
+
+  status = read_obu_header(&rb, is_annexb, obu_header);
+  if (status != AOM_CODEC_OK) return status;
+
+  if (is_annexb) {
+    // Payload size is the Annex B size minus the header; guard the underflow.
+    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+    *payload_size = obu_size - obu_header->size;
+  } else {
+    // The size field follows the OBU header and holds only the payload size.
+    status = read_obu_size(data + obu_header->size,
+                           bytes_available - obu_header->size, payload_size,
+                           &length_field_size);
+    if (status != AOM_CODEC_OK) return status;
+  }
+  // NOTE(review): payload_size and bytes_read are written without NULL checks.
+  *bytes_read = length_field_size + obu_header->size;  // Payload not included.
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 000000000..7c56904c8
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  size_t size;  // Size (1 or 2 bytes) of the OBU header (including the
+                // optional OBU extension header) in the bitstream.
+  OBU_TYPE type;
+  int has_size_field;  // Presumably the header's size-field flag -- confirm.
+  int has_extension;   // Nonzero if the optional extension header is present.
+  // The following fields come from the OBU extension header and therefore are
+  // only used if has_extension is true.
+  int temporal_layer_id;
+  int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_OBU_UTIL_H_
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
index e87c5a0bf..e1db0f44d 100644
--- a/third_party/aom/av1/common/odintrin.h
+++ b/third_party/aom/av1/common/odintrin.h
@@ -11,8 +11,8 @@
 
 /* clang-format off */
 
-#ifndef AV1_COMMON_ODINTRIN_H_
-#define AV1_COMMON_ODINTRIN_H_
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
 
 #include <stdlib.h>
 #include <string.h>
@@ -46,9 +46,9 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 #define OD_MAXI AOMMAX
 #define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
 
-#define OD_CLZ0 (1)
-#define OD_CLZ(x) (-get_msb(x))
-#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+  OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
 
 /*Enable special features for gcc and compatible compilers.*/
 #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -93,4 +93,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ODINTRIN_H_
+#endif  // AOM_AV1_COMMON_ODINTRIN_H_
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index 6b1bf2d74..ff011c89e 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ONYXC_INT_H_
-#define AV1_COMMON_ONYXC_INT_H_
+#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
+#define AOM_AV1_COMMON_ONYXC_INT_H_
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -480,6 +480,7 @@ typedef struct AV1Common {
 
   int byte_alignment;
   int skip_loop_filter;
+  int skip_film_grain;
 
   // Private data associated with the frame buffer callbacks.
   void *cb_priv;
@@ -823,18 +824,18 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
     xd->chroma_left_mbmi = chroma_left_mi;
   }
 
-  xd->n8_h = bh;
-  xd->n8_w = bw;
+  xd->n4_h = bh;
+  xd->n4_w = bw;
   xd->is_sec_rect = 0;
-  if (xd->n8_w < xd->n8_h) {
+  if (xd->n4_w < xd->n4_h) {
     // Only mark is_sec_rect as 1 for the last block.
     // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
     // For other partitions, it would be (0, 1).
-    if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1;
+    if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
   }
 
-  if (xd->n8_w > xd->n8_h)
-    if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1;
+  if (xd->n4_w > xd->n4_h)
+    if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
 }
 
 static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -1115,18 +1116,18 @@ static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
   for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
 }
 
-static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip,
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
                                  const MACROBLOCKD *xd) {
   uint8_t bw = tx_size_wide[tx_size];
   uint8_t bh = tx_size_high[tx_size];
 
   if (skip) {
-    bw = n8_w * MI_SIZE;
-    bh = n8_h * MI_SIZE;
+    bw = n4_w * MI_SIZE;
+    bh = n4_h * MI_SIZE;
   }
 
-  set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
-  set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
+  set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+  set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
 }
 
 static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
@@ -1338,4 +1339,4 @@ static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ONYXC_INT_H_
+#endif  // AOM_AV1_COMMON_ONYXC_INT_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
index 58933a7b3..026a07809 100644
--- a/third_party/aom/av1/common/ppc/cfl_ppc.c
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -24,19 +24,21 @@
 #define CFL_LINE_2 128
 #define CFL_LINE_3 192
 
-typedef vector int8_t int8x16_t;
-typedef vector uint8_t uint8x16_t;
-typedef vector int16_t int16x8_t;
-typedef vector uint16_t uint16x8_t;
-typedef vector int32_t int32x4_t;
-typedef vector uint32_t uint32x4_t;
-typedef vector uint64_t uint64x2_t;
+typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
+typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
+typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)
 
-static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
-                                        int height, int round_offset,
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+                                        int width, int height, int round_offset,
                                         int num_pel_log2) {
-  const int16_t *end = pred_buf + height * CFL_BUF_LINE;
-  const int16_t *sum_buf = pred_buf;
+  // NOTE(review): the stores below read back from dst, not src_ptr -- confirm.
+  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = (const int16_t *)src_ptr;
+  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
   const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
   const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
@@ -71,43 +73,40 @@ static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
   const int32x4_t avg = vec_sr(sum_32x4, div_shift);
   const int16x8_t vec_avg = vec_pack(avg, avg);
   do {
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf);
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg),
-               OFF_0 + CFL_BUF_LINE_BYTES, pred_buf);
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg),
-               OFF_0 + CFL_LINE_2, pred_buf);
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg),
-               OFF_0 + CFL_LINE_3, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
+               OFF_0 + CFL_BUF_LINE_BYTES, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
+               OFF_0 + CFL_LINE_2, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
+               OFF_0 + CFL_LINE_3, dst);
     if (width >= 16) {
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1,
-                 pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg),
-                 OFF_1 + CFL_LINE_1, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg),
-                 OFF_1 + CFL_LINE_2, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg),
-                 OFF_1 + CFL_LINE_3, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
+                 OFF_1 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
+                 OFF_1 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
+                 OFF_1 + CFL_LINE_3, dst);
     }
     if (width == 32) {
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2,
-                 pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg),
-                 OFF_2 + CFL_LINE_1, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg),
-                 OFF_2 + CFL_LINE_2, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg),
-                 OFF_2 + CFL_LINE_3, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
+                 OFF_2 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
+                 OFF_2 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
+                 OFF_2 + CFL_LINE_3, dst);
 
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3,
-                 pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg),
-                 OFF_3 + CFL_LINE_1, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg),
-                 OFF_3 + CFL_LINE_2, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg),
-                 OFF_3 + CFL_LINE_3, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
+                 OFF_3 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
+                 OFF_3 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
+                 OFF_3 + CFL_LINE_3, dst);
     }
-  } while ((pred_buf += CFL_BUF_LINE * 4) < end);
+  } while ((dst += CFL_BUF_LINE * 4) < dst_end);
 }
 
 // Declare wrappers for VSX sizes
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
index d77739d85..5952441d1 100644
--- a/third_party/aom/av1/common/pred_common.c
+++ b/third_party/aom/av1/common/pred_common.c
@@ -31,8 +31,8 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int ctx_offset =
       (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
-  MV_REFERENCE_FRAME ref_frame =
-      (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1];
+  assert(dir == 0 || dir == 1);
+  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
index 6a835c467..6dba2322d 100644
--- a/third_party/aom/av1/common/pred_common.h
+++ b/third_party/aom/av1/common/pred_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_PRED_COMMON_H_
-#define AV1_COMMON_PRED_COMMON_H_
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
 
 #include "av1/common/blockd.h"
 #include "av1/common/mvref_common.h"
@@ -357,4 +357,4 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_PRED_COMMON_H_
+#endif  // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index ca199e94c..d1f52a660 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_QUANT_COMMON_H_
-#define AV1_COMMON_QUANT_COMMON_H_
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
 
 #include "aom/aom_codec.h"
 #include "av1/common/seg_common.h"
@@ -60,4 +60,4 @@ const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_QUANT_COMMON_H_
+#endif  // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b9f0b57f3..3203efce4 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -44,10 +44,9 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
 
   if (build_for_obmc) return 0;
 
-  if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) {
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
     if (final_warp_params != NULL)
-      memcpy(final_warp_params, &mbmi->wm_params[0],
-             sizeof(*final_warp_params));
+      memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
     return 1;
   } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
     if (final_warp_params != NULL)
@@ -78,6 +77,9 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
        av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
                       build_for_obmc, subpel_params->xs, subpel_params->ys,
                       &final_warp_params));
+  const int is_intrabc = mi->use_intrabc;
+  assert(IMPLIES(is_intrabc, !do_warp));
+
   if (do_warp && xd->cur_frame_force_integer_mv == 0) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -88,10 +90,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                    pd->subsampling_x, pd->subsampling_y, conv_params);
   } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
-                           w, h, conv_params, interp_filters, xd->bd);
+                           w, h, conv_params, interp_filters, is_intrabc,
+                           xd->bd);
   } else {
     inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
-                    conv_params, interp_filters);
+                    conv_params, interp_filters, is_intrabc);
   }
 }
 
@@ -574,37 +577,6 @@ static void build_masked_compound_no_round(
                                  h, subw, subh, conv_params);
 }
 
-static void build_masked_compound(
-    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
-    const uint8_t *src1, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w) {
-  // Derive subsampling from h and w passed in. May be refactored to
-  // pass in subsampling factors directly.
-  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
-  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
-  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                     mask, block_size_wide[sb_type], w, h, subw, subh);
-}
-
-static void build_masked_compound_highbd(
-    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
-    const uint8_t *src1_8, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w, int bd) {
-  // Derive subsampling from h and w passed in. May be refactored to
-  // pass in subsampling factors directly.
-  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
-  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
-  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  // const uint8_t *mask =
-  //     av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
-  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                            src1_stride, mask, block_size_wide[sb_type], w, h,
-                            subw, subh, bd);
-}
-
 void av1_make_masked_inter_predictor(
     const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
     const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
@@ -653,63 +625,6 @@ void av1_make_masked_inter_predictor(
                                  mi->sb_type, h, w, conv_params, xd);
 }
 
-// TODO(sarahparker) av1_highbd_build_inter_predictor and
-// av1_build_inter_predictor should be combined with
-// av1_make_inter_predictor
-void av1_highbd_build_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
-    InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
-    int p_row, int plane, enum mv_precision precision, int x, int y,
-    const MACROBLOCKD *xd, int can_use_previous) {
-  const int is_q4 = precision == MV_PRECISION_Q4;
-  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
-                     is_q4 ? src_mv->col : src_mv->col * 2 };
-  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
-  mv.col += SCALE_EXTRA_OFF;
-  mv.row += SCALE_EXTRA_OFF;
-  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                       mv.col & SCALE_SUBPEL_MASK,
-                                       mv.row & SCALE_SUBPEL_MASK };
-  ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
-
-  src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
-         (mv.col >> SCALE_SUBPEL_BITS);
-
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
-                           w, h, &conv_params, interp_filters, warp_types,
-                           p_col, p_row, plane, ref, xd->mi[0], 0, xd,
-                           can_use_previous);
-}
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *src_mv,
-                               const struct scale_factors *sf, int w, int h,
-                               ConvolveParams *conv_params,
-                               InterpFilters interp_filters,
-                               const WarpTypesAllowed *warp_types, int p_col,
-                               int p_row, int plane, int ref,
-                               enum mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd, int can_use_previous) {
-  const int is_q4 = precision == MV_PRECISION_Q4;
-  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
-                     is_q4 ? src_mv->col : src_mv->col * 2 };
-  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
-  mv.col += SCALE_EXTRA_OFF;
-  mv.row += SCALE_EXTRA_OFF;
-
-  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                       mv.col & SCALE_SUBPEL_MASK,
-                                       mv.row & SCALE_SUBPEL_MASK };
-  src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
-         (mv.col >> SCALE_SUBPEL_BITS);
-
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
-                           w, h, conv_params, interp_filters, warp_types, p_col,
-                           p_row, plane, ref, xd->mi[0], 0, xd,
-                           can_use_previous);
-}
-
 void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
                                 int order_idx, int *fwd_offset, int *bck_offset,
                                 int *use_jnt_comp_avg, int is_compound) {
@@ -759,279 +674,6 @@ void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
   *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
 }
 
-static INLINE void calc_subpel_params(
-    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
-    int plane, const int pre_x, const int pre_y, int x, int y,
-    struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
-    int bw, int bh) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int is_scaled = av1_is_scaled(sf);
-  if (is_scaled) {
-    int ssx = pd->subsampling_x;
-    int ssy = pd->subsampling_y;
-    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
-    orig_pos_y += mv.row * (1 << (1 - ssy));
-    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
-    orig_pos_x += mv.col * (1 << (1 - ssx));
-    int pos_y = sf->scale_value_y(orig_pos_y, sf);
-    int pos_x = sf->scale_value_x(orig_pos_x, sf);
-    pos_x += SCALE_EXTRA_OFF;
-    pos_y += SCALE_EXTRA_OFF;
-
-    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                       << SCALE_SUBPEL_BITS;
-    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
-    pos_y = clamp(pos_y, top, bottom);
-    pos_x = clamp(pos_x, left, right);
-
-    *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-           (pos_x >> SCALE_SUBPEL_BITS);
-    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
-    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
-    subpel_params->xs = sf->x_step_q4;
-    subpel_params->ys = sf->y_step_q4;
-  } else {
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(
-        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
-    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
-           (x + (mv_q4.col >> SUBPEL_BITS));
-  }
-}
-
-static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int plane, const MB_MODE_INFO *mi,
-                                          int build_for_obmc, int bw, int bh,
-                                          int mi_x, int mi_y) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int is_compound = has_second_ref(mi);
-  int ref;
-  const int is_intrabc = is_intrabc_block(mi);
-  assert(IMPLIES(is_intrabc, !is_compound));
-  int is_global[2] = { 0, 0 };
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
-  }
-
-  const BLOCK_SIZE bsize = mi->sb_type;
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
-                     (block_size_high[bsize] < 8 && ss_y);
-
-  if (is_intrabc) sub8x8_inter = 0;
-
-  // For sub8x8 chroma blocks, we may be covering more than one luma block's
-  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
-  // the top-left corner of the prediction source - the correct top-left corner
-  // is at (pre_x, pre_y).
-  const int row_start =
-      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
-  const int col_start =
-      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
-  sub8x8_inter = sub8x8_inter && !build_for_obmc;
-  if (sub8x8_inter) {
-    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
-      for (int col = col_start; col <= 0; ++col) {
-        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
-        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
-      }
-    }
-  }
-
-  if (sub8x8_inter) {
-    // block size
-    const int b4_w = block_size_wide[bsize] >> ss_x;
-    const int b4_h = block_size_high[bsize] >> ss_y;
-    const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
-    const int b8_w = block_size_wide[plane_bsize] >> ss_x;
-    const int b8_h = block_size_high[plane_bsize] >> ss_y;
-    assert(!is_compound);
-
-    const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
-
-    int row = row_start;
-    for (int y = 0; y < b8_h; y += b4_h) {
-      int col = col_start;
-      for (int x = 0; x < b8_w; x += b4_w) {
-        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        is_compound = has_second_ref(this_mbmi);
-        DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
-        int tmp_dst_stride = 8;
-        assert(bw < 8 || bh < 8);
-        ConvolveParams conv_params = get_conv_params_no_round(
-            0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
-        conv_params.use_jnt_comp_avg = 0;
-        struct buf_2d *const dst_buf = &pd->dst;
-        uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-
-        ref = 0;
-        const RefBuffer *ref_buf =
-            &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
-
-        pd->pre[ref].buf0 =
-            (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
-        pd->pre[ref].buf =
-            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
-                                                     ref_buf->buf->uv_stride,
-                                                     &ref_buf->sf);
-        pd->pre[ref].width = ref_buf->buf->uv_crop_width;
-        pd->pre[ref].height = ref_buf->buf->uv_crop_height;
-        pd->pre[ref].stride = ref_buf->buf->uv_stride;
-
-        const struct scale_factors *const sf =
-            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
-        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-
-        const MV mv = this_mbmi->mv[ref].as_mv;
-
-        uint8_t *pre;
-        SubpelParams subpel_params;
-        WarpTypesAllowed warp_types;
-        warp_types.global_warp_allowed = is_global[ref];
-        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
-
-        calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
-                           &subpel_params, bw, bh);
-
-        conv_params.ref = ref;
-        conv_params.do_average = ref;
-        if (is_masked_compound_type(mi->interinter_comp.type)) {
-          // masked compound type has its own average mechanism
-          conv_params.do_average = 0;
-        }
-
-        av1_make_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
-            b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
-            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
-            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-
-        ++col;
-      }
-      ++row;
-    }
-
-    for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
-    return;
-  }
-
-  {
-    DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
-    ConvolveParams conv_params = get_conv_params_no_round(
-        0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
-    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
-                               &conv_params.bck_offset,
-                               &conv_params.use_jnt_comp_avg, is_compound);
-
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf;
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-      const MV mv = mi->mv[ref].as_mv;
-
-      uint8_t *pre;
-      SubpelParams subpel_params;
-      calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
-                         &subpel_params, bw, bh);
-
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global[ref];
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-      conv_params.ref = ref;
-
-      if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
-        // masked compound type has its own average mechanism
-        conv_params.do_average = 0;
-        av1_make_masked_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
-            bh, &conv_params, mi->interp_filters, plane, &warp_types,
-            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
-            cm->allow_warped_motion);
-      } else {
-        conv_params.do_average = ref;
-        av1_make_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
-            bh, &conv_params, mi->interp_filters, &warp_types,
-            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
-            mi, build_for_obmc, xd, cm->allow_warped_motion);
-      }
-    }
-  }
-}
-
-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
-                                              MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int mi_row, int mi_col,
-                                              int plane_from, int plane_to) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const struct macroblockd_plane *pd = &xd->plane[plane];
-    const int bw = pd->width;
-    const int bh = pd->height;
-
-    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                             pd->subsampling_y))
-      continue;
-
-    build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
-  }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize) {
-  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
-
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
-                               { xd->plane[0].dst.stride, 0, 0 } };
-    if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride, ctx, 0, bsize);
-  }
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize) {
-  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
-                                    MAX_MB_PLANE - 1);
-
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = {
-      { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
-      { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
-    };
-    if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sbuv(
-        cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
-  }
-}
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize) {
-  const int num_planes = av1_num_planes(cm);
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-  if (num_planes > 1)
-    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
-}
-
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const int plane_start, const int plane_end) {
@@ -1292,63 +934,7 @@ void av1_setup_build_prediction_by_above_pred(
 
   xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
   xd->mb_to_right_edge = ctxt->mb_to_far_edge +
-                         (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_above_pred(
-    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
-    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int above_mi_col = ctxt->mi_col + rel_mi_col;
-  int mi_x, mi_y;
-  MB_MODE_INFO backup_mbmi = *above_mbmi;
-
-  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
-                                           above_mbmi, ctxt, num_planes);
-  mi_x = above_mi_col << MI_SIZE_LOG2;
-  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
-
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
-  for (int j = 0; j < num_planes; ++j) {
-    const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
-    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
-                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
-
-    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
-  }
-  *above_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_width[MAX_MB_PLANE],
-                                         int tmp_height[MAX_MB_PLANE],
-                                         int tmp_stride[MAX_MB_PLANE]) {
-  if (!xd->up_available) return;
-
-  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
-  // prediction block. This is half the height of the original block,
-  // except for 128-wide blocks, where we only use a height of 32.
-  int this_height = xd->n8_h * MI_SIZE;
-  int pred_height = AOMMIN(this_height / 2, 32);
-  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_right_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
-                                build_prediction_by_above_pred, &ctxt);
-
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
-  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+                         (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
 }
 
 void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
@@ -1386,101 +972,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
   xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
   xd->mb_to_bottom_edge =
       ctxt->mb_to_far_edge +
-      (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_left_pred(
-    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
-    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int left_mi_row = ctxt->mi_row + rel_mi_row;
-  int mi_x, mi_y;
-  MB_MODE_INFO backup_mbmi = *left_mbmi;
-
-  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
-                                          left_mbmi, ctxt, num_planes);
-  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
-  mi_y = left_mi_row << MI_SIZE_LOG2;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
-  for (int j = 0; j < num_planes; ++j) {
-    const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
-                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
-    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
-  }
-  *left_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col,
-                                        uint8_t *tmp_buf[MAX_MB_PLANE],
-                                        int tmp_width[MAX_MB_PLANE],
-                                        int tmp_height[MAX_MB_PLANE],
-                                        int tmp_stride[MAX_MB_PLANE]) {
-  if (!xd->left_available) return;
-
-  // Adjust mb_to_right_edge to have the correct value for the OBMC
-  // prediction block. This is half the width of the original block,
-  // except for 128-wide blocks, where we only use a width of 32.
-  int this_width = xd->n8_w * MI_SIZE;
-  int pred_width = AOMMIN(this_width / 2, 32);
-  xd->mb_to_right_edge += (this_width - pred_width) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_bottom_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[mi_size_high_log2[bsize]],
-                               build_prediction_by_left_pred, &ctxt);
-
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_right_edge -= (this_width - pred_width) * 8;
-  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
-}
-
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col) {
-  const int num_planes = av1_num_planes(cm);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
-  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
-    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
-    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
-  } else {
-    dst_buf1[0] = tmp_buf1;
-    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
-    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = tmp_buf2;
-    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
-    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
-  }
-  av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
-                                      dst_width1, dst_height1, dst_stride1);
-  av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
-                                     dst_width2, dst_height2, dst_stride2);
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
-                       mi_row, mi_col, 0, num_planes);
-  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
-                                  dst_buf2, dst_stride2);
+      (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
 }
 
 /* clang-format off */
@@ -1668,127 +1160,3 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
   av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
   av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
 }
-
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     uint8_t *ypred, uint8_t *upred,
-                                     uint8_t *vpred, int ystride, int ustride,
-                                     int vstride, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize) {
-  av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
-  av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
-                                       ctx, bsize);
-}
-
-// Builds the inter-predictor for the single ref case
-// for use in the encoder to search the wedges efficiently.
-static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
-                                              int bw, int bh, int x, int y,
-                                              int w, int h, int mi_x, int mi_y,
-                                              int ref, uint8_t *const ext_dst,
-                                              int ext_dst_stride,
-                                              int can_use_previous) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MB_MODE_INFO *mi = xd->mi[0];
-
-  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-  struct buf_2d *const pre_buf = &pd->pre[ref];
-  uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
-  const MV mv = mi->mv[ref].as_mv;
-
-  ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
-  WarpTypesAllowed warp_types;
-  const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-  warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
-  warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-  const int pre_x = (mi_x) >> pd->subsampling_x;
-  const int pre_y = (mi_y) >> pd->subsampling_y;
-  uint8_t *pre;
-  SubpelParams subpel_params;
-  calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
-                     &subpel_params, bw, bh);
-
-  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
-                           &subpel_params, sf, w, h, &conv_params,
-                           mi->interp_filters, &warp_types, pre_x + x,
-                           pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
-}
-
-void av1_build_inter_predictors_for_planes_single_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
-    int can_use_previous) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
-                                      mi_y, ref, ext_dst[plane],
-                                      ext_dst_stride[plane], can_use_previous);
-  }
-}
-
-static void build_wedge_inter_predictor_from_buf(
-    MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
-    int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int is_compound = has_second_ref(mbmi);
-  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
-  struct buf_2d *const dst_buf = &pd->dst;
-  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-  mbmi->interinter_comp.seg_mask = xd->seg_mask;
-  const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
-
-  if (is_compound && is_masked_compound_type(comp_data->type)) {
-    if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        av1_build_compound_diffwtd_mask_highbd(
-            comp_data->seg_mask, comp_data->mask_type,
-            CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
-      else
-        av1_build_compound_diffwtd_mask(
-            comp_data->seg_mask, comp_data->mask_type, ext_dst0,
-            ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
-    }
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      build_masked_compound_highbd(
-          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
-          mbmi->sb_type, h, w, xd->bd);
-    else
-      build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
-                            h, w);
-  } else {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
-                               xd->bd);
-    else
-      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
-                        0, NULL, 0, w, h);
-  }
-}
-
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int plane_from, int plane_to,
-                                              uint8_t *ext_dst0[3],
-                                              int ext_dst_stride0[3],
-                                              uint8_t *ext_dst1[3],
-                                              int ext_dst_stride1[3]) {
-  int plane;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    build_wedge_inter_predictor_from_buf(
-        xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
-        ext_dst1[plane], ext_dst_stride1[plane]);
-  }
-}
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index 6a3def270..db86c777e 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_RECONINTER_H_
-#define AV1_COMMON_RECONINTER_H_
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
 
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
@@ -113,40 +113,48 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
                                    const SubpelParams *subpel_params,
                                    const struct scale_factors *sf, int w, int h,
                                    ConvolveParams *conv_params,
-                                   InterpFilters interp_filters) {
+                                   InterpFilters interp_filters,
+                                   int is_intrabc) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
   assert(sf);
-  if (has_scale(subpel_params->xs, subpel_params->ys)) {
+  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+  assert(IMPLIES(is_intrabc, !is_scaled));
+  if (is_scaled) {
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, subpel_params->subpel_x,
                            subpel_params->xs, subpel_params->subpel_y,
-                           subpel_params->ys, 1, conv_params, sf);
+                           subpel_params->ys, 1, conv_params, sf, is_intrabc);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
-                           sp.ys, 0, conv_params, sf);
+                           sp.ys, 0, conv_params, sf, is_intrabc);
   }
 }
 
-static INLINE void highbd_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) {
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+                                          uint8_t *dst, int dst_stride,
+                                          const SubpelParams *subpel_params,
+                                          const struct scale_factors *sf, int w,
+                                          int h, ConvolveParams *conv_params,
+                                          InterpFilters interp_filters,
+                                          int is_intrabc, int bd) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
   assert(sf);
-  if (has_scale(subpel_params->xs, subpel_params->ys)) {
-    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                                  interp_filters, subpel_params->subpel_x,
-                                  subpel_params->xs, subpel_params->subpel_y,
-                                  subpel_params->ys, 1, conv_params, sf, bd);
+  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+  assert(IMPLIES(is_intrabc, !is_scaled));
+  if (is_scaled) {
+    av1_highbd_convolve_2d_facade(
+        src, src_stride, dst, dst_stride, w, h, interp_filters,
+        subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
+        subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
-    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                                  interp_filters, sp.subpel_x, sp.xs,
-                                  sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+    av1_highbd_convolve_2d_facade(
+        src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
+        sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
   }
 }
 
@@ -237,35 +245,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
   return clamped_mv;
 }
 
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize);
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *src_mv,
-                               const struct scale_factors *sf, int w, int h,
-                               ConvolveParams *conv_params,
-                               InterpFilters interp_filters,
-                               const WarpTypesAllowed *warp_types, int p_col,
-                               int p_row, int plane, int ref,
-                               enum mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd, int can_use_previous);
-
-void av1_highbd_build_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
-    InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
-    int p_row, int plane, enum mv_precision precision, int x, int y,
-    const MACROBLOCKD *xd, int can_use_previous);
-
 static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
                                        const struct scale_factors *sf) {
   const int x =
@@ -303,32 +282,6 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *sf, const int num_planes);
 
-// Detect if the block have sub-pixel level motion vectors
-// per component.
-#define CHECK_SUBPEL 0
-static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
-                                          const MACROBLOCKD *const xd,
-                                          int dir) {
-#if CHECK_SUBPEL
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int plane;
-  int ref = (dir >> 1);
-
-  if (dir & 0x01) {
-    if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
-  } else {
-    if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
-  }
-
-  return 0;
-#else
-  (void)mbmi;
-  (void)xd;
-  (void)dir;
-  return 1;
-#endif
-}
-
 static INLINE void set_default_interp_filters(
     MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
   mbmi->interp_filters =
@@ -343,21 +296,6 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
   return 1;
 }
 
-static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
-  MB_MODE_INFO *const mi = xd->mi[0];
-  const int is_compound = has_second_ref(mi);
-  int ref;
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    int row_col;
-    for (row_col = 0; row_col < 2; ++row_col) {
-      const int dir = (ref << 1) + row_col;
-      if (has_subpel_mv_component(mi, xd, dir)) {
-        return 1;
-      }
-    }
-  }
-  return 0;
-}
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
@@ -367,18 +305,6 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
                                              MB_MODE_INFO *left_mbmi,
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes);
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_width[MAX_MB_PLANE],
-                                         int tmp_height[MAX_MB_PLANE],
-                                         int tmp_stride[MAX_MB_PLANE]);
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col,
-                                        uint8_t *tmp_buf[MAX_MB_PLANE],
-                                        int tmp_width[MAX_MB_PLANE],
-                                        int tmp_height[MAX_MB_PLANE],
-                                        int tmp_stride[MAX_MB_PLANE]);
 void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int mi_row, int mi_col,
                                      uint8_t *above[MAX_MB_PLANE],
@@ -389,8 +315,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
 const uint8_t *av1_get_obmc_mask(int length);
 void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                       int mi_row, int mi_col);
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col);
 
 #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
 #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
@@ -406,12 +330,6 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
 
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     uint8_t *ypred, uint8_t *upred,
-                                     uint8_t *vpred, int ystride, int ustride,
-                                     int vstride, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize);
-
 // build interintra_predictors for one plane
 void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *pred, int stride,
@@ -431,18 +349,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
 
-// Encoder only
-void av1_build_inter_predictors_for_planes_single_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
-    int can_use_previous);
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int plane_from, int plane_to,
-                                              uint8_t *ext_dst0[3],
-                                              int ext_dst_stride0[3],
-                                              uint8_t *ext_dst1[3],
-                                              int ext_dst_stride1[3]);
-
 void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
                                 int order_idx, int *fwd_offset, int *bck_offset,
                                 int *use_jnt_comp_avg, int is_compound);
@@ -456,4 +362,4 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_RECONINTER_H_
+#endif  // AOM_AV1_COMMON_RECONINTER_H_
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index 57638f24e..07853aba0 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_RECONINTRA_H_
-#define AV1_COMMON_RECONINTRA_H_
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
 
 #include <stdlib.h>
 
@@ -116,4 +116,4 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // AV1_COMMON_RECONINTRA_H_
+#endif  // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 93d62292a..d61a20aa2 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -170,42 +170,6 @@ static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
   { -1, 3, -9, 17, 112, 10, -7, 3 },  { -1, 3, -8, 15, 112, 12, -7, 2 },
 };
 
-// Filters for interpolation (full-band) - no filtering for integer pixels
-static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },        { 0, 0, -1, 128, 2, -1, 0, 0 },
-  { 0, 1, -3, 127, 4, -2, 1, 0 },      { 0, 1, -4, 127, 6, -3, 1, 0 },
-  { 0, 2, -6, 126, 8, -3, 1, 0 },      { 0, 2, -7, 125, 11, -4, 1, 0 },
-  { -1, 2, -8, 125, 13, -5, 2, 0 },    { -1, 3, -9, 124, 15, -6, 2, 0 },
-  { -1, 3, -10, 123, 18, -6, 2, -1 },  { -1, 3, -11, 122, 20, -7, 3, -1 },
-  { -1, 4, -12, 121, 22, -8, 3, -1 },  { -1, 4, -13, 120, 25, -9, 3, -1 },
-  { -1, 4, -14, 118, 28, -9, 3, -1 },  { -1, 4, -15, 117, 30, -10, 4, -1 },
-  { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
-  { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
-  { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
-  { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
-  { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
-  { -1, 6, -20, 97, 58, -17, 6, -1 },  { -1, 6, -20, 95, 61, -18, 6, -1 },
-  { -2, 7, -20, 93, 64, -18, 6, -2 },  { -2, 7, -20, 91, 66, -19, 6, -1 },
-  { -2, 7, -20, 88, 69, -19, 6, -1 },  { -2, 7, -20, 86, 71, -19, 6, -1 },
-  { -2, 7, -20, 84, 74, -20, 7, -2 },  { -2, 7, -20, 81, 76, -20, 7, -1 },
-  { -2, 7, -20, 79, 79, -20, 7, -2 },  { -1, 7, -20, 76, 81, -20, 7, -2 },
-  { -2, 7, -20, 74, 84, -20, 7, -2 },  { -1, 6, -19, 71, 86, -20, 7, -2 },
-  { -1, 6, -19, 69, 88, -20, 7, -2 },  { -1, 6, -19, 66, 91, -20, 7, -2 },
-  { -2, 6, -18, 64, 93, -20, 7, -2 },  { -1, 6, -18, 61, 95, -20, 6, -1 },
-  { -1, 6, -17, 58, 97, -20, 6, -1 },  { -1, 6, -17, 56, 99, -20, 6, -1 },
-  { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
-  { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
-  { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
-  { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
-  { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
-  { -1, 3, -9, 28, 118, -14, 4, -1 },  { -1, 3, -9, 25, 120, -13, 4, -1 },
-  { -1, 3, -8, 22, 121, -12, 4, -1 },  { -1, 3, -7, 20, 122, -11, 3, -1 },
-  { -1, 2, -6, 18, 123, -10, 3, -1 },  { 0, 2, -6, 15, 124, -9, 3, -1 },
-  { 0, 2, -5, 13, 125, -8, 2, -1 },    { 0, 1, -4, 11, 125, -7, 2, 0 },
-  { 0, 1, -3, 8, 126, -6, 2, 0 },      { 0, 1, -3, 6, 127, -4, 1, 0 },
-  { 0, 1, -2, 4, 127, -3, 1, 0 },      { 0, 0, -1, 2, 128, -1, 0, 0 },
-};
-
 const int16_t av1_resize_filter_normative[(
     1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
 #if UPSCALE_NORMATIVE_TAPS == 8
@@ -246,6 +210,9 @@ const int16_t av1_resize_filter_normative[(
 #endif  // UPSCALE_NORMATIVE_TAPS == 8
 };
 
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
 // Filters for factor of 2 downsampling.
 static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
 static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
index feec3a90e..9a59a8d63 100644
--- a/third_party/aom/av1/common/resize.h
+++ b/third_party/aom/av1/common/resize.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RESIZE_H_
-#define AV1_ENCODER_RESIZE_H_
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
 
 #include <stdio.h>
 #include "aom/aom_integer.h"
@@ -109,4 +109,4 @@ int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RESIZE_H_
+#endif  // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 632967957..d276a915b 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -661,9 +661,10 @@ const int32_t one_by_x[MAX_NELEM] = {
   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
 };
 
-static void selfguided_restoration_fast_internal(
-    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
-    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+                                          int dgd_stride, int bit_depth,
+                                          int sgr_params_idx, int radius_idx,
+                                          int pass, int32_t *A, int32_t *B) {
   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
@@ -673,10 +674,7 @@ static void selfguided_restoration_fast_internal(
   // We also align the stride to a multiple of 16 bytes, for consistency
   // with the SIMD version of this function.
   int buf_stride = ((width_ext + 3) & ~3) + 16;
-  int32_t A_[RESTORATION_PROC_UNIT_PELS];
-  int32_t B_[RESTORATION_PROC_UNIT_PELS];
-  int32_t *A = A_;
-  int32_t *B = B_;
+  const int step = pass == 0 ? 1 : 2;
   int i, j;
 
   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
@@ -691,7 +689,7 @@ static void selfguided_restoration_fast_internal(
   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
-  for (i = -1; i < height + 1; i += 2) {
+  for (i = -1; i < height + 1; i += step) {
     for (j = -1; j < width + 1; ++j) {
       const int k = i * buf_stride + j;
       const int n = (2 * r + 1) * (2 * r + 1);
@@ -754,7 +752,31 @@ static void selfguided_restoration_fast_internal(
                                          SGRPROJ_RECIP_BITS);
     }
   }
+}
+
+static void selfguided_restoration_fast_internal(
+    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes, for consistency
+  // with the SIMD version of this function.
+  int buf_stride = ((width_ext + 3) & ~3) + 16;
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+  int32_t *A = A_;
+  int32_t *B = B_;
+  int i, j;
+  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+                                sgr_params_idx, radius_idx, 1, A, B);
+  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
   // Use the A[] and B[] arrays to calculate the filtered image
+  (void)r;
   assert(r == 2);
   for (i = 0; i < height; ++i) {
     if (!(i & 1)) {  // even row
@@ -796,10 +818,7 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
                                             int dst_stride, int bit_depth,
                                             int sgr_params_idx,
                                             int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
-  const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
-  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   // Adjusting the stride of A and B here appears to avoid bad cache effects,
   // leading to a significant speed improvement.
   // We also align the stride to a multiple of 16 bytes, for consistency
@@ -810,82 +829,11 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
   int32_t *A = A_;
   int32_t *B = B_;
   int i, j;
-
-  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
-  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
-         "Need SGRPROJ_BORDER_* >= r+1");
-
-  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
-         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
-  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
-         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+                                sgr_params_idx, radius_idx, 0, A, B);
   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
-  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
-  for (i = -1; i < height + 1; ++i) {
-    for (j = -1; j < width + 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int n = (2 * r + 1) * (2 * r + 1);
-
-      // a < 2^16 * n < 2^22 regardless of bit depth
-      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
-      // b < 2^8 * n < 2^14 regardless of bit depth
-      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
-
-      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
-      // and p itself satisfies p < 2^14 * n^2 < 2^26.
-      // This bound on p is due to:
-      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
-      //
-      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
-      // This is an artefact of rounding, and can only happen if all pixels
-      // are (almost) identical, so in this case we saturate to p=0.
-      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
-
-      const uint32_t s = params->s[radius_idx];
-
-      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
-      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
-      // (this holds even after accounting for the rounding in s)
-      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
-
-      // Note: We have to be quite careful about the value of A[k].
-      // This is used as a blend factor between individual pixel values and the
-      // local mean. So it logically has a range of [0, 256], including both
-      // endpoints.
-      //
-      // This is a pain for hardware, as we'd like something which can be stored
-      // in exactly 8 bits.
-      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
-      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
-      // slightly above 2^(8 + bit depth), due to rounding in the value of
-      // one_by_x[25-1].
-      //
-      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
-      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
-      // overflow), without significantly affecting the final result: z == 0
-      // implies that the image is essentially "flat", so the local mean and
-      // individual pixel values are very similar.
-      //
-      // Note that saturating on the other side, ie. requring A[k] <= 255,
-      // would be a bad idea, as that corresponds to the case where the image
-      // is very variable, when we want to preserve the local pixel value as
-      // much as possible.
-      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
 
-      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
-      // one_by_x[n - 1] = round(2^12 / n)
-      // => the product here is < 2^(20 + bit_depth) <= 2^32,
-      // and B[k] is set to a value < 2^(8 + bit depth)
-      // This holds even with the rounding in one_by_x and in the overall
-      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
-      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
-                                             (uint32_t)B[k] *
-                                             (uint32_t)one_by_x[n - 1],
-                                         SGRPROJ_RECIP_BITS);
-    }
-  }
   // Use the A[] and B[] arrays to calculate the filtered image
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
@@ -911,10 +859,10 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
   }
 }
 
-void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
-                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
-                                  int flt_stride, int sgr_params_idx,
-                                  int bit_depth, int highbd) {
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
+                                 int flt_stride, int sgr_params_idx,
+                                 int bit_depth, int highbd) {
   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
   int32_t *dgd32 =
@@ -948,6 +896,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
   if (params->r[1] > 0)
     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
                                     flt_stride, bit_depth, sgr_params_idx, 1);
+  return 0;
 }
 
 void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
@@ -959,8 +908,10 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
 
-  av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
-                               eps, bit_depth, highbd);
+  const int ret = av1_selfguided_restoration_c(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  (void)ret;
+  assert(!ret);
   const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
   decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index aec37d834..d834f9270 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_RESTORATION_H_
-#define AV1_COMMON_RESTORATION_H_
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
 
 #include "aom_ports/mem.h"
 #include "config/aom_config.h"
@@ -120,6 +120,7 @@ extern "C" {
 // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
 // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
 #define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
 
 #define WIENER_FILT_PREC_BITS 7
 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
@@ -373,4 +374,4 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_RESTORATION_H_
+#endif  // AOM_AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
index 5f02fdb81..748e958c3 100644
--- a/third_party/aom/av1/common/scale.h
+++ b/third_party/aom/av1/common/scale.h
@@ -9,12 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SCALE_H_
-#define AV1_COMMON_SCALE_H_
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
 
 #include "av1/common/convolve.h"
 #include "av1/common/mv.h"
-#include "aom_dsp/aom_convolve.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -65,4 +64,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_SCALE_H_
+#endif  // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index d206586b5..233dc0efa 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SCAN_H_
-#define AV1_COMMON_SCAN_H_
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -52,4 +52,4 @@ static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_SCAN_H_
+#endif  // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
index c851d65fd..8c35bba86 100644
--- a/third_party/aom/av1/common/seg_common.h
+++ b/third_party/aom/av1/common/seg_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SEG_COMMON_H_
-#define AV1_COMMON_SEG_COMMON_H_
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
 
 #include "aom_dsp/prob.h"
 
@@ -101,4 +101,4 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_SEG_COMMON_H_
+#endif  // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index f9b734b8c..8df4c9a09 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -304,8 +304,9 @@ static INLINE void thread_loop_filter_rows(
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
-                                  LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
   thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                           lf_data->xd, lf_sync);
   return 1;
@@ -342,7 +343,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
-    worker->hook = (AVxWorkerHook)loop_filter_row_worker;
+    worker->hook = loop_filter_row_worker;
     worker->data1 = lf_sync;
     worker->data2 = lf_data;
 
@@ -649,8 +650,9 @@ AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
 }
 
 // Implement row loop restoration for each thread.
-static int loop_restoration_row_worker(AV1LrSync *const lr_sync,
-                                       LRWorkerData *lrworkerdata) {
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+  AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+  LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
   AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
   FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
   int lr_unit_row;
@@ -714,10 +716,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   int num_rows_lr = 0;
 
   for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
     const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
     const int max_tile_h = tile_rect.bottom - tile_rect.top;
 
-    const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+    const int unit_size = cm->rst_info[plane].restoration_unit_size;
 
     num_rows_lr =
         AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
@@ -746,7 +750,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   for (i = 0; i < num_workers; ++i) {
     AVxWorker *const worker = &workers[i];
     lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
-    worker->hook = (AVxWorkerHook)loop_restoration_row_worker;
+    worker->hook = loop_restoration_row_worker;
     worker->data1 = lr_sync;
     worker->data2 = &lr_sync->lrworkerdata[i];
 
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
index 4b0d5d2b8..23d61d72a 100644
--- a/third_party/aom/av1/common/thread_common.h
+++ b/third_party/aom/av1/common/thread_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_
-#define AV1_COMMON_LOOPFILTER_THREAD_H_
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
 
 #include "config/aom_config.h"
 
@@ -116,4 +116,4 @@ void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_LOOPFILTER_THREAD_H_
+#endif  // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 026c904b6..1b413487f 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -127,6 +127,22 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
   assert(tile->mi_col_end > tile->mi_col_start);
 }
 
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
+  int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
+      tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
+  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+  return sb_rows;
+}
+
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
+  int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
+      tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
+  int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+  return sb_cols;
+}
+
 int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
   // Round the frame up to a whole number of max superblocks
   mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
index be037fb17..c03553dc6 100644
--- a/third_party/aom/av1/common/tile_common.h
+++ b/third_party/aom/av1/common/tile_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_TILE_COMMON_H_
-#define AV1_COMMON_TILE_COMMON_H_
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,6 +44,9 @@ void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
 // tiles horizontally or vertically in the frame.
 int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
 
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
 typedef struct {
   int left, top, right, bottom;
 } AV1PixelRect;
@@ -66,4 +69,4 @@ void av1_calculate_tile_rows(struct AV1Common *const cm);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_TILE_COMMON_H_
+#endif  // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index 1749baa57..06939ae43 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_TIMING_H_
-#define AOM_TIMING_H_
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
 
 #include "aom/aom_integer.h"
 #include "av1/common/enums.h"
@@ -56,4 +56,4 @@ void set_resource_availability_parameters(
 int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
                           int seq_tier);
 
-#endif  // AOM_TIMING_H_
+#endif  // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
index 9a6b454ac..53e956450 100644
--- a/third_party/aom/av1/common/token_cdfs.h
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
 #include "config/aom_config.h"
 
 #include "av1/common/entropy.h"
@@ -3548,3 +3551,5 @@ static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
                                         { AOM_CDF3(10923, 21845) },
                                         { AOM_CDF3(10923, 21845) },
                                         { AOM_CDF3(10923, 21845) } } } } };
+
+#endif  // AOM_AV1_COMMON_TOKEN_CDFS_H_
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index f0ab79d0f..1dda51f8b 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_TXB_COMMON_H_
-#define AV1_COMMON_TXB_COMMON_H_
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
 
 extern const int16_t k_eob_group_start[12];
 extern const int16_t k_eob_offset_bits[12];
@@ -34,24 +34,6 @@ static const int base_level_count_to_index[13] = {
   0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
 };
 
-// Note: TX_PAD_2D is dependent to this offset table.
-static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
-  /* clang-format off*/
-  { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 },
-  { 0, 2 },  { 1, -1 },  { 1, 0 },  { 1, 1 },  { 2, 0 }
-  /* clang-format on*/
-};
-
-#define CONTEXT_MAG_POSITION_NUM 3
-static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = {
-  { { 0, 1 }, { 1, 0 }, { 1, 1 } },
-  { { 0, 1 }, { 1, 0 }, { 0, 2 } },
-  { { 0, 1 }, { 1, 0 }, { 2, 0 } }
-};
-static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = {
-  { 0, 1 }, { 1, 0 }, { 1, 1 }
-};
-
 static const TX_CLASS tx_type_to_class[TX_TYPES] = {
   TX_CLASS_2D,     // DCT_DCT
   TX_CLASS_2D,     // ADST_DCT
@@ -71,61 +53,6 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = {
   TX_CLASS_HORIZ,  // H_FLIPADST
 };
 
-static const int8_t eob_to_pos_small[33] = {
-  0, 1, 2,                                        // 0-2
-  3, 3,                                           // 3-4
-  4, 4, 4, 4,                                     // 5-8
-  5, 5, 5, 5, 5, 5, 5, 5,                         // 9-16
-  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6  // 17-32
-};
-
-static const int8_t eob_to_pos_large[17] = {
-  6,                               // place holder
-  7,                               // 33-64
-  8,  8,                           // 65-128
-  9,  9,  9,  9,                   // 129-256
-  10, 10, 10, 10, 10, 10, 10, 10,  // 257-512
-  11                               // 513-
-};
-
-static INLINE int get_eob_pos_token(const int eob, int *const extra) {
-  int t;
-
-  if (eob < 33) {
-    t = eob_to_pos_small[eob];
-  } else {
-    const int e = AOMMIN((eob - 1) >> 5, 16);
-    t = eob_to_pos_large[e];
-  }
-
-  *extra = eob - k_eob_group_start[t];
-
-  return t;
-}
-
-static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type,
-                                      const int eob_token) {
-  static const int8_t tx_type_to_offset[TX_TYPES] = {
-    -1,  // DCT_DCT
-    -1,  // ADST_DCT
-    -1,  // DCT_ADST
-    -1,  // ADST_ADST
-    -1,  // FLIPADST_DCT
-    -1,  // DCT_FLIPADST
-    -1,  // FLIPADST_FLIPADST
-    -1,  // ADST_FLIPADST
-    -1,  // FLIPADST_ADST
-    -1,  // IDTX
-    10,  // V_DCT
-    10,  // H_DCT
-    10,  // V_ADST
-    10,  // H_ADST
-    10,  // V_FLIPADST
-    10,  // H_FLIPADST
-  };
-  return eob_token + tx_type_to_offset[tx_type];
-}
-
 static INLINE int get_txb_bwl(TX_SIZE tx_size) {
   tx_size = av1_get_adjusted_tx_size(tx_size);
   return tx_size_wide_log2[tx_size];
@@ -141,36 +68,6 @@ static INLINE int get_txb_high(TX_SIZE tx_size) {
   return tx_size_high[tx_size];
 }
 
-static INLINE void get_base_count_mag(int *mag, int *count,
-                                      const tran_low_t *tcoeffs, int bwl,
-                                      int height, int row, int col) {
-  mag[0] = 0;
-  mag[1] = 0;
-  for (int i = 0; i < NUM_BASE_LEVELS; ++i) count[i] = 0;
-  for (int idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
-    const int ref_row = row + base_ref_offset[idx][0];
-    const int ref_col = col + base_ref_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    // count
-    for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
-      count[i] += abs_coeff > i;
-    }
-    // mag
-    if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) {
-      if (abs_coeff > mag[0]) {
-        mag[0] = abs_coeff;
-        mag[1] = 1;
-      } else if (abs_coeff == mag[0]) {
-        ++mag[1];
-      }
-    }
-  }
-}
-
 static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
   return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
 }
@@ -179,30 +76,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) {
   return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
 }
 
-static INLINE int get_level_count(const uint8_t *const levels, const int stride,
-                                  const int row, const int col, const int level,
-                                  const int (*nb_offset)[2], const int nb_num) {
-  int count = 0;
-
-  for (int idx = 0; idx < nb_num; ++idx) {
-    const int ref_row = row + nb_offset[idx][0];
-    const int ref_col = col + nb_offset[idx][1];
-    const int pos = ref_row * stride + ref_col;
-    count += levels[pos] > level;
-  }
-  return count;
-}
-
-static INLINE void get_level_mag(const uint8_t *const levels, const int stride,
-                                 const int row, const int col, int *const mag) {
-  for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) {
-    const int ref_row = row + mag_ref_offset[idx][0];
-    const int ref_col = col + mag_ref_offset[idx][1];
-    const int pos = ref_row * stride + ref_col;
-    mag[idx] = levels[pos];
-  }
-}
-
 static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
                                               int sig_mag) {
   const int ctx = base_level_count_to_index[count];
@@ -267,84 +140,6 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
   return ctx_idx;
 }
 
-static INLINE int get_base_ctx(const uint8_t *const levels,
-                               const int c,  // raster order
-                               const int bwl, const int level_minus_1,
-                               const int count) {
-  const int row = c >> bwl;
-  const int col = c - (row << bwl);
-  const int stride = (1 << bwl) + TX_PAD_HOR;
-  int mag_count = 0;
-  int nb_mag[3] = { 0 };
-
-  get_level_mag(levels, stride, row, col, nb_mag);
-
-  for (int idx = 0; idx < 3; ++idx)
-    mag_count += nb_mag[idx] > (level_minus_1 + 1);
-  const int ctx_idx =
-      get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count));
-  return ctx_idx;
-}
-
-#define BR_CONTEXT_POSITION_NUM 8  // Base range coefficient context
-// Note: TX_PAD_2D is dependent to this offset table.
-static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
-  /* clang-format off*/
-  { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 },
-  { 0, 1 },   { 1, -1 }, { 1, 0 },  { 1, 1 },
-  /* clang-format on*/
-};
-
-static const int br_level_map[9] = {
-  0, 0, 1, 1, 2, 2, 3, 3, 3,
-};
-
-// Note: If BR_MAG_OFFSET changes, the calculation of offset in
-// get_br_ctx_from_count_mag() must be updated.
-#define BR_MAG_OFFSET 1
-// TODO(angiebird): optimize this function by using a table to map from
-// count/mag to ctx
-
-static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl,
-                                   int height, int row, int col, int level) {
-  mag[0] = 0;
-  mag[1] = 0;
-  int count = 0;
-  for (int idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) {
-    const int ref_row = row + br_ref_offset[idx][0];
-    const int ref_col = col + br_ref_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    count += abs_coeff > level;
-    if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0) {
-      if (abs_coeff > mag[0]) {
-        mag[0] = abs_coeff;
-        mag[1] = 1;
-      } else if (abs_coeff == mag[0]) {
-        ++mag[1];
-      }
-    }
-  }
-  return count;
-}
-
-static INLINE int get_br_ctx_from_count_mag(const int row, const int col,
-                                            const int count, const int mag) {
-  // DC: 0 - 1
-  // Top row: 2 - 4
-  // Left column: 5 - 7
-  // others: 8 - 11
-  static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } };
-  const int mag_clamp = AOMMIN(mag, 6);
-  const int offset = mag_clamp >> 1;
-  const int ctx =
-      br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col];
-  return ctx;
-}
-
 static INLINE int get_br_ctx_2d(const uint8_t *const levels,
                                 const int c,  // raster order
                                 const int bwl) {
@@ -396,38 +191,6 @@ static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
   return mag + 14;
 }
 
-#define SIG_REF_OFFSET_NUM 5
-
-// Note: TX_PAD_2D is dependent to these offset tables.
-static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = {
-  { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 }
-  // , { 1, 2 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = {
-  { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 }
-  // , { 1, 1 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = {
-  { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 }
-  // , { 1, 1 }, { 1, 2 },
-};
-
-#define SIG_REF_DIFF_OFFSET_NUM 3
-
-static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = {
-  { 1, 1 }, { 0, 2 }, { 2, 0 }
-};
-
-static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = {
-  { 2, 0 }, { 3, 0 }, { 4, 0 }
-};
-
-static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = {
-  { 0, 2 }, { 0, 3 }, { 0, 4 }
-};
-
 static const uint8_t clip_max3[256] = {
   0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -658,4 +421,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
 
 void av1_init_lv_map(AV1_COMMON *cm);
 
-#endif  // AV1_COMMON_TXB_COMMON_H_
+#endif  // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index 412d83ed8..4144c4389 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -562,7 +562,7 @@ static int64_t highbd_warp_error(
   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
 
-  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  ConvolveParams conv_params = get_conv_params(0, 0, bd);
   conv_params.use_jnt_comp_avg = 0;
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
@@ -845,7 +845,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
-  ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+  ConvolveParams conv_params = get_conv_params(0, 0, 8);
   conv_params.use_jnt_comp_avg = 0;
 
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index ce4032ee5..a1a4f067d 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_WARPED_MOTION_H_
-#define AV1_COMMON_WARPED_MOTION_H_
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -92,4 +92,4 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
                     int mi_col);
 
 int get_shear_params(WarpedMotionParams *wm);
-#endif  // AV1_COMMON_WARPED_MOTION_H_
+#endif  // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 0c5286f9d..d9fb53785 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index ae331b40d..5db2ccf6c 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -18,6 +18,12 @@
 #include "av1/common/x86/av1_inv_txfm_avx2.h"
 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
 
+// TODO(venkatsanampudi@ittiam.com): move this to a header file
+
+// Sqrt2^1 .. Sqrt2^5, each scaled by 4096 (e.g. 5793 ~= sqrt(2) * 4096).
+static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                4 * 4096, 4 * 5793 };
+
 static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index 7b5b29cf8..f74cbaeaa 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
 
 #include <immintrin.h>
 
@@ -68,4 +68,4 @@ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
 }
 #endif
 
-#endif  // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
index dd7cee24c..995bc3da4 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -16,6 +16,12 @@
 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
 #include "av1/common/x86/av1_txfm_sse2.h"
 
+// TODO(venkatsanampudi@ittiam.com): move this to a header file
+
+// Sqrt2^1 .. Sqrt2^5, each scaled by 4096 (e.g. 5793 ~= sqrt(2) * 4096).
+static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                4 * 4096, 4 * 5793 };
+
 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
 
 static void idct4_new_sse2(const __m128i *input, __m128i *output,
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
index dc9be25d2..66bd339d1 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
 
 #include <emmintrin.h>  // SSE2
 #include <tmmintrin.h>  // SSSE3
@@ -94,10 +94,6 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
   IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
 };
 
-// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
-static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
-                                          4 * 5793 };
-
 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
 };
@@ -233,4 +229,4 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
index 721cfe059..77aeb6eb1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
 
@@ -314,4 +314,4 @@ typedef struct {
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
-#endif  // AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index 367e02096..6cad821b1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_TXFM_SSE4_H_
-#define AV1_TXFM_SSE4_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
 
 #include <smmintrin.h>
 
@@ -45,8 +45,9 @@ static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
 static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
                                                         __m128i *output,
                                                         const int size,
-                                                        const int bit) {
-  const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2);
+                                                        const int bit,
+                                                        const int val) {
+  const __m128i sqrt2 = _mm_set1_epi32(val);
   if (bit > 0) {
     int i;
     for (i = 0; i < size; i++) {
@@ -68,4 +69,4 @@ static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
 }
 #endif
 
-#endif  // AV1_TXFM_SSE4_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
index 7479ac3e1..3b342cd4e 100644
--- a/third_party/aom/av1/common/x86/cfl_simd.h
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
 #include "av1/common/blockd.h"
 
 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
@@ -236,3 +239,5 @@ void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                              int dst_stride, int alpha_q3, int bd);
 void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                              int dst_stride, int alpha_q3, int bd);
+
+#endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index 1099144fe..0acafd044 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -11,10 +11,8 @@
 
 #include <immintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index 637f83cf7..b1a62a4f6 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -11,9 +11,8 @@
 
 #include <emmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index f66dee37d..5016642de 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -11,9 +11,8 @@
 
 #include <emmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
@@ -76,8 +75,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
   return convolve(ss, coeffs);
 }
 
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
@@ -237,8 +236,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
   }
 }
 
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index 8444ffa93..ae68f0bbb 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index eb340523a..3f8dafb4b 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -15,7 +15,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 33183fdee..1d029db39 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index debb05a6d..ade2af03e 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -15,6 +15,9 @@
 #include "config/av1_rtcd.h"
 
 #include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
 
 // Note:
 //  Total 32x4 registers to represent 32x32 block coefficients.
@@ -27,131 +30,125 @@
 //   ... ...
 //   v124, v125, v126, v127
 
-static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+  __m256i clamped, mask;
+
+  mask = _mm256_cmpgt_epi16(u, max);
+  clamped = _mm256_andnot_si256(mask, u);
+  mask = _mm256_and_si256(mask, max);
+  clamped = _mm256_or_si256(mask, clamped);
+  mask = _mm256_cmpgt_epi16(clamped, zero);
+  clamped = _mm256_and_si256(clamped, mask);
+
+  return clamped;
+}
+
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+                                                 __m256i res0, __m256i res1,
+                                                 const int bd) {
+  __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+  __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+
+  x0 = _mm256_add_epi32(res0, x0);
+  x1 = _mm256_add_epi32(res1, x1);
+  x0 = _mm256_packus_epi32(x0, x1);
+  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+  x0 = highbd_clamp_epi16_avx2(x0, bd);
+  return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+                                                 int stride, int flipud,
+                                                 int height, const int bd) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+    __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
+
+    _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+  }
+}
+
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+  // Adds half of 2^bit to every 32-bit lane, then shifts right arithmetically.
+  // The offset is 0 when bit == 0 so 1 << (bit - 1) never sees a count of -1.
+  const __m256i round = _mm256_set1_epi32(bit > 0 ? 1 << (bit - 1) : 0);
+  return _mm256_srai_epi32(_mm256_add_epi32(vec, round), bit);
+}
+
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+                                                 __m256i *output,
+                                                 const int size,
+                                                 const int bit) {
+  if (bit > 0) {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = av1_round_shift_32_avx2(input[i], bit);
+    }
+  } else {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = _mm256_slli_epi32(input[i], -bit);
+    }
+  }
+}
+
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
   __m256i u0, u1, u2, u3, u4, u5, u6, u7;
   __m256i x0, x1;
 
-  u0 = _mm256_unpacklo_epi32(in[0], in[4]);
-  u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+  u0 = _mm256_unpacklo_epi32(in[0], in[1]);
+  u1 = _mm256_unpackhi_epi32(in[0], in[1]);
 
-  u2 = _mm256_unpacklo_epi32(in[8], in[12]);
-  u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+  u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+  u3 = _mm256_unpackhi_epi32(in[2], in[3]);
 
-  u4 = _mm256_unpacklo_epi32(in[16], in[20]);
-  u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+  u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+  u5 = _mm256_unpackhi_epi32(in[4], in[5]);
 
-  u6 = _mm256_unpacklo_epi32(in[24], in[28]);
-  u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+  u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+  u7 = _mm256_unpackhi_epi32(in[6], in[7]);
 
   x0 = _mm256_unpacklo_epi64(u0, u2);
   x1 = _mm256_unpacklo_epi64(u4, u6);
   out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
 
   x0 = _mm256_unpackhi_epi64(u0, u2);
   x1 = _mm256_unpackhi_epi64(u4, u6);
-  out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
 
   x0 = _mm256_unpacklo_epi64(u1, u3);
   x1 = _mm256_unpacklo_epi64(u5, u7);
-  out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
 
   x0 = _mm256_unpackhi_epi64(u1, u3);
   x1 = _mm256_unpackhi_epi64(u5, u7);
-  out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
-}
-
-static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
-  transpose_32x32_8x8(&in[0], &out[0]);
-  transpose_32x32_8x8(&in[1], &out[32]);
-  transpose_32x32_8x8(&in[32], &out[1]);
-  transpose_32x32_8x8(&in[33], &out[33]);
-}
-
-static void transpose_32x32(const __m256i *in, __m256i *out) {
-  transpose_32x32_16x16(&in[0], &out[0]);
-  transpose_32x32_16x16(&in[2], &out[64]);
-  transpose_32x32_16x16(&in[64], &out[2]);
-  transpose_32x32_16x16(&in[66], &out[66]);
+  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
 }
 
-static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
+                              int input_stride, int size) {
   int i;
-  for (i = 0; i < 128; ++i) {
-    in[i] = _mm256_loadu_si256((const __m256i *)coeff);
-    coeff += 8;
+  for (i = 0; i < size; ++i) {  // in[i] <- 256 bits at coeff + i * stride
+    in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
   }
 }
 
-static __m256i highbd_clamp_epi32(__m256i x, int bd) {
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(x, max);
-  clamped = _mm256_andnot_si256(mask, x);
-  mask = _mm256_and_si256(mask, max);
-  clamped = _mm256_or_si256(mask, clamped);
-  mask = _mm256_cmpgt_epi16(clamped, zero);
-  clamped = _mm256_and_si256(clamped, mask);
-
-  return clamped;
-}
-
-static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
-  const __m256i zero = _mm256_setzero_si256();
-  int i = 0;
-  (void)fliplr;
-  (void)flipud;
-
-  __m256i round = _mm256_set1_epi32((1 << shift) >> 1);
-
-  while (i < 128) {
-    u0 = _mm256_loadu_si256((const __m256i *)output);
-    u1 = _mm256_loadu_si256((const __m256i *)(output + 16));
-
-    x0 = _mm256_unpacklo_epi16(u0, zero);
-    x1 = _mm256_unpackhi_epi16(u0, zero);
-    x2 = _mm256_unpacklo_epi16(u1, zero);
-    x3 = _mm256_unpackhi_epi16(u1, zero);
-
-    v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
-    v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
-    v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
-    v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
-
-    v0 = _mm256_add_epi32(v0, round);
-    v1 = _mm256_add_epi32(v1, round);
-    v2 = _mm256_add_epi32(v2, round);
-    v3 = _mm256_add_epi32(v3, round);
-
-    v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift));
-    v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift));
-    v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift));
-    v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift));
-
-    v0 = _mm256_add_epi32(v0, x0);
-    v1 = _mm256_add_epi32(v1, x1);
-    v2 = _mm256_add_epi32(v2, x2);
-    v3 = _mm256_add_epi32(v3, x3);
-
-    v0 = _mm256_packus_epi32(v0, v1);
-    v2 = _mm256_packus_epi32(v2, v3);
-
-    v0 = highbd_clamp_epi32(v0, bd);
-    v2 = highbd_clamp_epi32(v2, bd);
-
-    _mm256_storeu_si256((__m256i *)output, v0);
-    _mm256_storeu_si256((__m256i *)(output + 16), v2);
-    output += stride;
-    i += 4;
-  }
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+                                      const __m256i *rounding, int bit) {
+  __m256i x;
+  x = _mm256_mullo_epi32(*w0, *n0);
+  x = _mm256_add_epi32(x, *rounding);
+  x = _mm256_srai_epi32(x, bit);
+  return x;
 }
 
 static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
@@ -200,18 +197,549 @@ static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
   __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
   __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
 
+  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
   a0 = _mm256_max_epi32(a0, *clamp_lo);
   a0 = _mm256_min_epi32(a0, *clamp_hi);
   a1 = _mm256_max_epi32(a1, *clamp_lo);
   a1 = _mm256_min_epi32(a1, *clamp_hi);
 
-  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
   *out0 = a0;
   *out1 = a1;
 }
 
+static INLINE void idct32_stage4_avx2(
+    __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+    const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+    const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+    const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;
+  temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+  bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+  bf1[17] = temp1;
+
+  temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+  bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+  bf1[18] = temp2;
+
+  temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+  bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+  bf1[21] = temp1;
+
+  temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+  bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+  bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(
+    __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+    const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;
+  temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+  bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+  bf1[9] = temp1;
+
+  temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+  bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+  bf1[10] = temp2;
+
+  addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_avx2(
+    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+    const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;
+  temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[5] = temp1;
+
+  addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+  bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+  bf1[18] = temp1;
+  temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+  bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+  bf1[19] = temp2;
+  temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+  bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+  bf1[20] = temp1;
+  temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+  bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+  bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;
+  addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[10] = temp1;
+  temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[11] = temp2;
+
+  addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;
+  addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[20] = temp1;
+  temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[21] = temp2;
+  temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[22] = temp1;
+  temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+                                      const int do_cols, const int bd,
+                                      const int out_shift,
+                                      const int log_range) {
+  if (do_cols) {
+    addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
+    addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
+    addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
+    addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
+    addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
+    addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
+    addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
+    addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
+    addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
+    addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
+    addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
+    addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
+    addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
+    addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
+    addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
+    addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+  }
+}
+
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i x;
+  // stage 0 (this variant reads only in[0]; stages 0-4 need no arithmetic)
+  // stage 1
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5: x = (in[0] * cospi[32] + (1 << (bit - 1))) >> bit, per lane
+  x = _mm256_mullo_epi32(in[0], cospi32);
+  x = _mm256_add_epi32(x, rounding);
+  x = _mm256_srai_epi32(x, bit);
+
+  // stage 6
+  // stage 7
+  // stage 8
+  // stage 9: do_cols clamps only; otherwise round-shift by out_shift, then clamp
+  if (do_cols) {
+    x = _mm256_max_epi32(x, clamp_lo);
+    x = _mm256_min_epi32(x, clamp_hi);
+  } else {  // NOTE(review): clamp_hi_out's second bound lacks "- 1"; confirm
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+    x = _mm256_add_epi32(offset, x);
+    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+    x = _mm256_max_epi32(x, clamp_lo_out);
+    x = _mm256_min_epi32(x, clamp_hi_out);
+  }
+
+  out[0] = x;  // every one of the 32 outputs receives the same vector
+  out[1] = x;
+  out[2] = x;
+  out[3] = x;
+  out[4] = x;
+  out[5] = x;
+  out[6] = x;
+  out[7] = x;
+  out[8] = x;
+  out[9] = x;
+  out[10] = x;
+  out[11] = x;
+  out[12] = x;
+  out[13] = x;
+  out[14] = x;
+  out[15] = x;
+  out[16] = x;
+  out[17] = x;
+  out[18] = x;
+  out[19] = x;
+  out[20] = x;
+  out[21] = x;
+  out[22] = x;
+  out[23] = x;
+  out[24] = x;
+  out[25] = x;
+  out[26] = x;
+  out[27] = x;
+  out[28] = x;
+  out[29] = x;
+  out[30] = x;
+  out[31] = x;
+}
+
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {  // reads in[0..7] only
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i bf1[32];
+
+  {
+    // stage 0
+    // stage 1: scatter in[0..7] into bf1; other slots are set in stages 2-6
+    bf1[0] = in[0];
+    bf1[4] = in[4];
+    bf1[8] = in[2];
+    bf1[12] = in[6];
+    bf1[16] = in[1];
+    bf1[20] = in[5];
+    bf1[24] = in[3];
+    bf1[28] = in[7];
+
+    // stage 2
+    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+    // stage 3: half_btf_0 on bf1[8], bf1[12]; plain copies within bf1[16..31]
+    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+    bf1[17] = bf1[16];
+    bf1[18] = bf1[19];
+    bf1[21] = bf1[20];
+    bf1[22] = bf1[23];
+    bf1[25] = bf1[24];
+    bf1[26] = bf1[27];
+    bf1[29] = bf1[28];
+    bf1[30] = bf1[31];
+
+    // stage 4
+    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+    bf1[9] = bf1[8];
+    bf1[10] = bf1[11];
+    bf1[13] = bf1[12];
+    bf1[14] = bf1[15];
+
+    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+    // stage 5
+    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+    bf1[1] = bf1[0];
+    bf1[5] = bf1[4];
+    bf1[6] = bf1[7];
+
+    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
+
+    // stage 6
+    bf1[3] = bf1[0];
+    bf1[2] = bf1[1];
+
+    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+    // stage 7
+    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 8
+    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 9: fills out[0..31] (addsub_no_clamp if do_cols, else addsub_shift)
+    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {  // reads in[0..15] only
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i bf1[32];
+
+  {
+    // stage 0
+    // stage 1: scatter in[0..15] into the even bf1 slots; odd slots set later
+    bf1[0] = in[0];
+    bf1[2] = in[8];
+    bf1[4] = in[4];
+    bf1[6] = in[12];
+    bf1[8] = in[2];
+    bf1[10] = in[10];
+    bf1[12] = in[6];
+    bf1[14] = in[14];
+    bf1[16] = in[1];
+    bf1[18] = in[9];
+    bf1[20] = in[5];
+    bf1[22] = in[13];
+    bf1[24] = in[3];
+    bf1[26] = in[11];
+    bf1[28] = in[7];
+    bf1[30] = in[15];
+
+    // stage 2
+    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+    bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+    bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+    bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+    bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+    bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+    bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+    bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+    bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+    // stage 3: half_btf_0 fills bf1[8..15]; addsub_avx2 on pairs in bf1[16..31]
+    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+    bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+    bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+    bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+    bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+    addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+    // stage 4
+    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+    bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+    bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+    addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+    // stage 5
+    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+    bf1[1] = bf1[0];
+    bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+    bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+    addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
+
+    // stage 6
+    addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+    // stage 7
+    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 8
+    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 9: fills out[0..31] (addsub_no_clamp if do_cols, else addsub_shift)
+    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
 static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                         int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -270,43 +798,42 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
   __m256i bf1[32], bf0[32];
-  int col;
 
-  for (col = 0; col < 4; ++col) {
+  {
     // stage 0
     // stage 1
-    bf1[0] = in[0 * 4 + col];
-    bf1[1] = in[16 * 4 + col];
-    bf1[2] = in[8 * 4 + col];
-    bf1[3] = in[24 * 4 + col];
-    bf1[4] = in[4 * 4 + col];
-    bf1[5] = in[20 * 4 + col];
-    bf1[6] = in[12 * 4 + col];
-    bf1[7] = in[28 * 4 + col];
-    bf1[8] = in[2 * 4 + col];
-    bf1[9] = in[18 * 4 + col];
-    bf1[10] = in[10 * 4 + col];
-    bf1[11] = in[26 * 4 + col];
-    bf1[12] = in[6 * 4 + col];
-    bf1[13] = in[22 * 4 + col];
-    bf1[14] = in[14 * 4 + col];
-    bf1[15] = in[30 * 4 + col];
-    bf1[16] = in[1 * 4 + col];
-    bf1[17] = in[17 * 4 + col];
-    bf1[18] = in[9 * 4 + col];
-    bf1[19] = in[25 * 4 + col];
-    bf1[20] = in[5 * 4 + col];
-    bf1[21] = in[21 * 4 + col];
-    bf1[22] = in[13 * 4 + col];
-    bf1[23] = in[29 * 4 + col];
-    bf1[24] = in[3 * 4 + col];
-    bf1[25] = in[19 * 4 + col];
-    bf1[26] = in[11 * 4 + col];
-    bf1[27] = in[27 * 4 + col];
-    bf1[28] = in[7 * 4 + col];
-    bf1[29] = in[23 * 4 + col];
-    bf1[30] = in[15 * 4 + col];
-    bf1[31] = in[31 * 4 + col];
+    bf1[0] = in[0];
+    bf1[1] = in[16];
+    bf1[2] = in[8];
+    bf1[3] = in[24];
+    bf1[4] = in[4];
+    bf1[5] = in[20];
+    bf1[6] = in[12];
+    bf1[7] = in[28];
+    bf1[8] = in[2];
+    bf1[9] = in[18];
+    bf1[10] = in[10];
+    bf1[11] = in[26];
+    bf1[12] = in[6];
+    bf1[13] = in[22];
+    bf1[14] = in[14];
+    bf1[15] = in[30];
+    bf1[16] = in[1];
+    bf1[17] = in[17];
+    bf1[18] = in[9];
+    bf1[19] = in[25];
+    bf1[20] = in[5];
+    bf1[21] = in[21];
+    bf1[22] = in[13];
+    bf1[23] = in[29];
+    bf1[24] = in[3];
+    bf1[25] = in[19];
+    bf1[26] = in[11];
+    bf1[27] = in[27];
+    bf1[28] = in[7];
+    bf1[29] = in[23];
+    bf1[30] = in[15];
+    bf1[31] = in[31];
 
     // stage 2
     bf0[0] = bf1[0];
@@ -568,91 +1095,255 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
 
     // stage 9
     if (do_cols) {
-      addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col,
-                           out + 31 * 4 + col);
-      addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col,
-                           out + 30 * 4 + col);
-      addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col,
-                           out + 29 * 4 + col);
-      addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col,
-                           out + 28 * 4 + col);
-      addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col,
-                           out + 27 * 4 + col);
-      addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col,
-                           out + 26 * 4 + col);
-      addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col,
-                           out + 25 * 4 + col);
-      addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col,
-                           out + 24 * 4 + col);
-      addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col,
-                           out + 23 * 4 + col);
-      addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col,
-                           out + 22 * 4 + col);
-      addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
-                           out + 21 * 4 + col);
-      addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
-                           out + 20 * 4 + col);
-      addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
-                           out + 19 * 4 + col);
-      addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
-                           out + 18 * 4 + col);
-      addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
-                           out + 17 * 4 + col);
-      addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
-                           out + 16 * 4 + col);
+      addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
+      addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
+      addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
+      addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
+      addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
+      addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
+      addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
+      addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
+      addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
+      addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
+      addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
+      addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
+      addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
+      addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
+      addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
+      addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
     } else {
-      addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
-                        out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
-                        out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
-                        out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
-                        out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
-                        out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
-                        out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
     }
   }
 }
 
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
-                                   int stride, TX_TYPE tx_type, int bd) {
-  __m256i in[128], out[128];
-  const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
-  const int txw_idx = get_txw_idx(TX_32X32);
-  const int txh_idx = get_txh_idx(TX_32X32);
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+                                  int do_cols, int bd, int out_shift);
+// Kernel table indexed [txw_idx or txh_idx][1-D transform type][fun_idx].
+static const transform_1d_avx2
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },  // the entry below is the only one holding non-NULL kernels
+      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+                                                   uint16_t *output, int stride,
+                                                   TX_TYPE tx_type,
+                                                   TX_SIZE tx_size, int eob,
+                                                   const int bd) {
+  __m256i buf1[64 * 2];  // receives the transposed output of the row pass
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_avx2 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
+  // 1st stage: row transform, 8 input rows per iteration
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    __m256i buf0[32];
+    const int32_t *input_row = input + i * input_stride * 8;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      __m256i *buf0_cur = buf0 + j * 8;
+      load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
+
+      transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
+    }
+
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m256i *_buf1 = buf1 + i * 8;
+    for (int j = 0; j < buf_size_w_div8; ++j) {
+      transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+    }
+  }
+  // 2nd stage: column transform in place, then round shift by -shift[1]
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer: each call covers a 16-column-wide strip of the output
+  {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+                                    output + 16 * i, stride, ud_flip,
+                                    txfm_size_row, bd);
+    }
+  }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+                                             uint8_t *output, int stride,
+                                             TX_TYPE tx_type, TX_SIZE tx_size,
+                                             int eob, const int bd) {
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_32x32(coeff, in);
-      transpose_32x32(in, out);
-      idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
-      transpose_32x32(in, out);
-      idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
+      highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+                                             stride, tx_type, tx_size, eob, bd);
       break;
+    default: assert(0); break;  // only DCT_DCT is handled here
+  }
+}
+
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);  // used by the IDTX path only
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
+                                              txfm_param->tx_size,
+                                              txfm_param->eob, bd);
+      break;
+      // No AVX2 kernel for IDTX here, so use the C version for it.
+    case IDTX:
+      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                 tx_type, bd);
+      break;
+
     default: assert(0);
   }
 }
+
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  switch (tx_size) {  // only TX_32X32 dispatches to an _avx2 function
+    case TX_32X32:
+      av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
+      break;
+    case TX_16X16:
+      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      break;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X16:
+      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X8:
+      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X32:
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X16:
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      break;
+    case TX_32X64:
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X32:
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      break;
+    case TX_64X64:  // these three sizes share the SSE4.1 universe path
+    case TX_16X64:
+    case TX_64X16:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
+      break;
+    default: assert(0 && "Invalid transform size"); break;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 801a4133b..e29e0baf5 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -15,8 +15,60 @@
 #include "config/av1_rtcd.h"
 
 #include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
 
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {  // Clamps each 16-bit lane of u to [0, (1 << bd) - 1].
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every lane
+  __m128i clamped, mask;
+
+  mask = _mm_cmpgt_epi16(u, max);  // signed compare: all-ones in lanes where u > max
+  clamped = _mm_andnot_si128(mask, u);  // keep u only in lanes that are not above max
+  mask = _mm_and_si128(mask, max);  // max in the lanes that were above max, 0 elsewhere
+  clamped = _mm_or_si128(mask, clamped);  // merge: min(u, max) per lane (signed)
+  mask = _mm_cmpgt_epi16(clamped, zero);  // signed compare: all-ones in lanes that are > 0
+  clamped = _mm_and_si128(clamped, mask);  // lanes <= 0 become 0; NOTE(review): a lane with bit 15 set is treated as negative
+
+  return clamped;
+}
+
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+                                                  __m128i res0, __m128i res1,
+                                                  const int bd) {  // Returns pred + (res0, res1) as eight clamped 16-bit lanes.
+  __m128i x0 = _mm_cvtepi16_epi32(pred);  // low four 16-bit lanes of pred, sign-extended to 32 bits
+  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));  // high four lanes (pred shifted right by 8 bytes)
+
+  x0 = _mm_add_epi32(res0, x0);  // res0 pairs with the low four lanes
+  x1 = _mm_add_epi32(res1, x1);  // res1 pairs with the high four lanes
+  x0 = _mm_packus_epi32(x0, x1);  // back to eight 16-bit lanes with unsigned saturation
+  x0 = highbd_clamp_epi16(x0, bd);  // then limit each lane according to bd
+  return x0;
+}
+
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+                                                  int stride, int flipud,
+                                                  int height, const int bd) {  // Adds vectors from in to height rows of 8 values in output, in place.
+  int j = flipud ? (height - 1) : 0;  // index into in: starts at the last row when flipud is set
+  const int step = flipud ? -1 : 1;  // walk in backwards when flipud is set
+  for (int i = 0; i < height; ++i, j += step) {
+    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));  // current row i of output (unaligned; stride counts uint16_t elements)
+    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);  // in[j] / in[j + height] feed the low / high four lanes; in is read up to index 2 * height - 1
+
+    _mm_storeu_si128((__m128i *)(output + i * stride), u);  // overwrite row i with the result
+  }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           __m128i *out, int out_size) {  // Fills out[0 .. out_size - 1] with 128-bit loads from in.
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));  // unaligned load at in + i * stride (stride counts int32_t elements)
+  }
+}
+
 static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
   in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
   in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
@@ -57,18 +109,231 @@ static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
   __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
   __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
 
+  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
   a0 = _mm_max_epi32(a0, *clamp_lo);
   a0 = _mm_min_epi32(a0, *clamp_hi);
   a1 = _mm_max_epi32(a1, *clamp_lo);
   a1 = _mm_min_epi32(a1, *clamp_hi);
 
-  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
   *out0 = a0;
   *out1 = a1;
 }
 
+static INLINE void idct32_stage4_sse4_1(
+    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+    const __m128i *rounding, int bit) {  // Updates the bf1 pairs (17,30), (18,29), (21,26), (22,25) in place.
+  __m128i temp1, temp2;  // hold the first output of a pair until the second has been computed
+  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);  // NOTE(review): half_btf_sse4_1 is defined outside this chunk; presumably a rounded weighted sum of its two inputs - confirm
+  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+  bf1[17] = temp1;  // stored last because bf1[17] was still an input to the line above
+
+  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+  bf1[29] =
+      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+  bf1[18] = temp2;  // same deferred store for the (18,29) pair
+
+  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+  bf1[26] =
+      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+  bf1[21] = temp1;  // same deferred store for the (21,26) pair
+
+  temp2 =
+      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+  bf1[25] =
+      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+  bf1[22] = temp2;  // same deferred store for the (22,25) pair
+}
+
+static INLINE void idct32_stage5_sse4_1(
+    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+    const __m128i *clamp_hi, const __m128i *rounding, int bit) {  // Updates bf1 pairs (9,14), (10,13) and entries 16..31 in place.
+  __m128i temp1, temp2;  // hold the first output of a pair until the second has been computed
+  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+  bf1[9] = temp1;  // stored last because bf1[9] was still an input to the line above
+
+  temp2 =
+      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+  bf1[10] = temp2;  // same deferred store for the (10,13) pair
+
+  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);  // NOTE(review): addsub_sse4_1 is defined outside this chunk; by its use here it writes a clamped sum/difference of the first two args to the two pointers - confirm
+  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);  // operands reversed: the first output lands in the higher index
+  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);  // operands reversed again for the 28..31 group
+  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(
+    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+    const __m128i *rounding, int bit) {  // Updates bf1 pair (5,6), entries 8..15 and pairs (18,29), (19,28), (20,27), (21,26) in place.
+  __m128i temp1, temp2;  // hold the first output of a pair until the second has been computed
+  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[5] = temp1;  // stored last because bf1[5] was still an input to the line above
+
+  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);  // NOTE(review): addsub_sse4_1 is defined outside this chunk; presumably clamped sum/difference - confirm
+  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);  // operands reversed: the first output lands in the higher index
+  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+  bf1[29] =
+      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+  bf1[18] = temp1;  // deferred store for the (18,29) pair
+  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+  bf1[28] =
+      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+  bf1[19] = temp2;  // deferred store for the (19,28) pair
+  temp1 =
+      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+  bf1[27] =
+      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+  bf1[20] = temp1;  // deferred store for the (20,27) pair
+  temp2 =
+      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+  bf1[26] =
+      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+  bf1[21] = temp2;  // deferred store for the (21,26) pair
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rounding, int bit) {  // Updates bf1 entries 0..7, pairs (10,13), (11,12) and entries 16..31 in place.
+  __m128i temp1, temp2;  // hold the first output of a pair until the second has been computed
+  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);  // pairs index k with 7 - k; NOTE(review): addsub_sse4_1 is defined outside this chunk - confirm it is a clamped sum/difference
+  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[10] = temp1;  // stored last because bf1[10] was still an input to the statement above
+  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[12] =
+      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[11] = temp2;  // same deferred store for the (11,12) pair
+
+  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);  // pairs index k with 39 - k for k = 16..19
+  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);  // operands reversed: the first output lands in the higher index
+  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rounding, int bit) {  // Updates bf1 entries 0..15 and pairs (20,27), (21,26), (22,25), (23,24) in place.
+  __m128i temp1, temp2;  // hold the first output of a pair until the second has been computed
+  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);  // pairs index k with 15 - k; NOTE(review): addsub_sse4_1 is defined outside this chunk - confirm it is a clamped sum/difference
+  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[27] =
+      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[20] = temp1;  // stored last because bf1[20] was still an input to the statement above
+  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[26] =
+      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[21] = temp2;  // deferred store for the (21,26) pair
+  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[25] =
+      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[22] = temp1;  // deferred store for the (22,25) pair
+  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[24] =
+      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[23] = temp2;  // deferred store for the (23,24) pair
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+                                        const int do_cols, const int bd,
+                                        const int out_shift,
+                                        const int log_range) {  // Writes out[k] and out[31 - k] from bf1[k] and bf1[31 - k] for k = 0..15.
+  if (do_cols) {  // this branch uses the no-clamp helper and ignores out_shift, bd and log_range
+    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);  // NOTE(review): helper is defined outside this chunk; presumably plain sum/difference - confirm
+    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
+    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
+    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
+    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
+    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
+    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
+    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
+    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
+    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
+    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
+    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
+    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
+    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
+    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
+    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
+  } else {  // do_cols == 0: outputs are shifted by out_shift and then clamped to the bounds below
+    const int log_range_out = AOMMAX(16, bd + 6);  // 16 while bd <= 10, bd + 6 above that
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));  // the tighter (larger) of the two lower bounds
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));  // NOTE(review): second bound has no "- 1", unlike the first - confirm this asymmetry is intended
+
+    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
+
 static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                              __m128i *out0, __m128i *out1,
                              const __m128i *clamp_lo, const __m128i *clamp_hi,
@@ -77,14 +342,14 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
   __m128i a0 = _mm_add_epi32(offset, in0);
   __m128i a1 = _mm_sub_epi32(offset, in1);
 
+  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
   a0 = _mm_max_epi32(a0, *clamp_lo);
   a0 = _mm_min_epi32(a0, *clamp_hi);
   a1 = _mm_max_epi32(a1, *clamp_lo);
   a1 = _mm_min_epi32(a1, *clamp_hi);
 
-  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
   *out0 = a0;
   *out1 = a1;
 }
@@ -96,9 +361,6 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3, x, y;
@@ -135,11 +397,19 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   v3 = _mm_add_epi32(v3, rnding);
   v3 = _mm_srai_epi32(v3, bit);
 
-  addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
-  addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+  if (do_cols) {
+    addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
+    addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
+  } else {
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+  }
 }
 
-static void iadst4x4_sse4_1(__m128i *in, int bit) {
+static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   const int32_t *sinpi = sinpi_arr(bit);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
@@ -197,6 +467,21 @@ static void iadst4x4_sse4_1(__m128i *in, int bit) {
   u3 = _mm_add_epi32(u3, rnding);
   u3 = _mm_srai_epi32(u3, bit);
 
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+    u0 = _mm_max_epi32(u0, clamp_lo);
+    u0 = _mm_min_epi32(u0, clamp_hi);
+    u1 = _mm_max_epi32(u1, clamp_lo);
+    u1 = _mm_min_epi32(u1, clamp_hi);
+    u2 = _mm_max_epi32(u2, clamp_lo);
+    u2 = _mm_min_epi32(u2, clamp_hi);
+    u3 = _mm_max_epi32(u3, clamp_lo);
+    u3 = _mm_min_epi32(u3, clamp_hi);
+  }
+
   in[0] = u0;
   in[1] = u1;
   in[2] = u2;
@@ -217,22 +502,6 @@ static INLINE void round_shift_4x4(__m128i *in, int shift) {
   in[3] = _mm_srai_epi32(in[3], shift);
 }
 
-static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  __m128i clamped, mask;
-
-  mask = _mm_cmpgt_epi16(u, max);
-  clamped = _mm_andnot_si128(mask, u);
-  mask = _mm_and_si128(mask, max);
-  clamped = _mm_or_si128(mask, clamped);
-  mask = _mm_cmpgt_epi16(clamped, zero);
-  clamped = _mm_and_si128(clamped, mask);
-
-  return clamped;
-}
-
 static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                              int fliplr, int flipud, int shift, int bd) {
   const __m128i zero = _mm_setzero_si128();
@@ -304,49 +573,49 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
     case ADST_DCT:
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
       idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case FLIPADST_DCT:
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
       idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case ADST_FLIPADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_ADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     default: assert(0);
@@ -482,14 +751,19 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
       addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
       addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
     } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
       addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
       addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
       addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
       addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
     }
   }
 }
@@ -651,14 +925,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     out[12] = u[5];
     out[14] = _mm_sub_epi32(kZero, u[1]);
   } else {
-    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi,
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
-    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi,
+    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 
   // Odd 8 points: 1, 3, ..., 15
@@ -796,14 +1074,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     out[13] = u[5];
     out[15] = _mm_sub_epi32(kZero, u[1]);
   } else {
-    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi,
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
-    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi,
+    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 }
 
@@ -976,81 +1258,51 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
   }
 }
 
-// 16x16
-static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
-  int i;
-  for (i = 0; i < 64; ++i) {
-    in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
-  }
-}
-
-static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
-                                        int col) {
-  int i;
-  for (i = 0; i < 16; i += 2) {
-    in8x8[i] = in[col];
-    in8x8[i + 1] = in[col + 1];
-    col += 4;
-  }
-}
-
-static void swap_addr(uint16_t **output1, uint16_t **output2) {
-  uint16_t *tmp;
-  tmp = *output1;
-  *output1 = *output2;
-  *output2 = tmp;
-}
-
-static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m128i in8x8[16];
-  uint16_t *leftUp = &output[0];
-  uint16_t *rightUp = &output[8];
-  uint16_t *leftDown = &output[8 * stride];
-  uint16_t *rightDown = &output[8 * stride + 8];
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i x;
 
-  if (fliplr) {
-    swap_addr(&leftUp, &rightUp);
-    swap_addr(&leftDown, &rightDown);
-  }
+  // stage 0
+  // stage 1
+  // stage 2
+  // stage 3
+  x = _mm_mullo_epi32(in[0], cospi32);
+  x = _mm_add_epi32(x, rnding);
+  x = _mm_srai_epi32(x, bit);
 
-  if (flipud) {
-    swap_addr(&leftUp, &leftDown);
-    swap_addr(&rightUp, &rightDown);
+  // stage 4
+  // stage 5
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    x = _mm_add_epi32(x, offset);
+    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+    x = _mm_max_epi32(x, clamp_lo_out);
+    x = _mm_min_epi32(x, clamp_hi_out);
   }
 
-  // Left-up quarter
-  assign_8x8_input_from_16x16(in, in8x8, 0);
-  write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
-
-  // Right-up quarter
-  assign_8x8_input_from_16x16(in, in8x8, 2);
-  write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
-
-  // Left-down quarter
-  assign_8x8_input_from_16x16(in, in8x8, 32);
-  write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
-
-  // Right-down quarter
-  assign_8x8_input_from_16x16(in, in8x8, 34);
-  write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+  out[0] = x;
+  out[1] = x;
+  out[2] = x;
+  out[3] = x;
+  out[4] = x;
+  out[5] = x;
+  out[6] = x;
+  out[7] = x;
 }
 
-static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
-                             int bd, int out_shift) {
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
-  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
-  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
-  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
-  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
-  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
-  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
-  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
-  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
-  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
-  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
-  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
@@ -1059,473 +1311,687 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
-  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i u[16], v[16], x, y;
-  int col;
-
-  for (col = 0; col < 4; ++col) {
-    // stage 0
-    // stage 1
-    u[0] = in[0 * 4 + col];
-    u[1] = in[8 * 4 + col];
-    u[2] = in[4 * 4 + col];
-    u[3] = in[12 * 4 + col];
-    u[4] = in[2 * 4 + col];
-    u[5] = in[10 * 4 + col];
-    u[6] = in[6 * 4 + col];
-    u[7] = in[14 * 4 + col];
-    u[8] = in[1 * 4 + col];
-    u[9] = in[9 * 4 + col];
-    u[10] = in[5 * 4 + col];
-    u[11] = in[13 * 4 + col];
-    u[12] = in[3 * 4 + col];
-    u[13] = in[11 * 4 + col];
-    u[14] = in[7 * 4 + col];
-    u[15] = in[15 * 4 + col];
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
 
-    // stage 2
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
+  // stage 0
+  // stage 1
+  // stage 2
+  u0 = in[0];
+  u1 = in[4];
+  u2 = in[2];
+  u3 = in[6];
+
+  x = _mm_mullo_epi32(in[1], cospi56);
+  y = _mm_mullo_epi32(in[7], cospim8);
+  u4 = _mm_add_epi32(x, y);
+  u4 = _mm_add_epi32(u4, rnding);
+  u4 = _mm_srai_epi32(u4, bit);
+
+  x = _mm_mullo_epi32(in[1], cospi8);
+  y = _mm_mullo_epi32(in[7], cospi56);
+  u7 = _mm_add_epi32(x, y);
+  u7 = _mm_add_epi32(u7, rnding);
+  u7 = _mm_srai_epi32(u7, bit);
+
+  x = _mm_mullo_epi32(in[5], cospi24);
+  y = _mm_mullo_epi32(in[3], cospim40);
+  u5 = _mm_add_epi32(x, y);
+  u5 = _mm_add_epi32(u5, rnding);
+  u5 = _mm_srai_epi32(u5, bit);
+
+  x = _mm_mullo_epi32(in[5], cospi40);
+  y = _mm_mullo_epi32(in[3], cospi24);
+  u6 = _mm_add_epi32(x, y);
+  u6 = _mm_add_epi32(u6, rnding);
+  u6 = _mm_srai_epi32(u6, bit);
 
-    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
-    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
-    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
-    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
-    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+  // stage 3
+  x = _mm_mullo_epi32(u0, cospi32);
+  y = _mm_mullo_epi32(u1, cospi32);
+  v0 = _mm_add_epi32(x, y);
+  v0 = _mm_add_epi32(v0, rnding);
+  v0 = _mm_srai_epi32(v0, bit);
 
-    // stage 3
-    u[0] = v[0];
-    u[1] = v[1];
-    u[2] = v[2];
-    u[3] = v[3];
-    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
-    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
-    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
-    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
-    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+  v1 = _mm_sub_epi32(x, y);
+  v1 = _mm_add_epi32(v1, rnding);
+  v1 = _mm_srai_epi32(v1, bit);
 
-    // stage 4
-    x = _mm_mullo_epi32(u[0], cospi32);
-    y = _mm_mullo_epi32(u[1], cospi32);
-    v[0] = _mm_add_epi32(x, y);
-    v[0] = _mm_add_epi32(v[0], rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+  x = _mm_mullo_epi32(u2, cospi48);
+  y = _mm_mullo_epi32(u3, cospim16);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
 
-    v[1] = _mm_sub_epi32(x, y);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+  x = _mm_mullo_epi32(u2, cospi16);
+  y = _mm_mullo_epi32(u3, cospi48);
+  v3 = _mm_add_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
 
-    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
-    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
-    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
-    v[8] = u[8];
-    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
-    v[11] = u[11];
-    v[12] = u[12];
-    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
-    v[15] = u[15];
+  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 
-    // stage 5
-    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
-    u[4] = v[4];
+  // stage 4
+  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+  u4 = v4;
+  u7 = v7;
 
-    x = _mm_mullo_epi32(v[5], cospi32);
-    y = _mm_mullo_epi32(v[6], cospi32);
-    u[5] = _mm_sub_epi32(y, x);
-    u[5] = _mm_add_epi32(u[5], rnding);
-    u[5] = _mm_srai_epi32(u[5], bit);
+  x = _mm_mullo_epi32(v5, cospi32);
+  y = _mm_mullo_epi32(v6, cospi32);
+  u6 = _mm_add_epi32(y, x);
+  u6 = _mm_add_epi32(u6, rnding);
+  u6 = _mm_srai_epi32(u6, bit);
 
-    u[6] = _mm_add_epi32(y, x);
-    u[6] = _mm_add_epi32(u[6], rnding);
-    u[6] = _mm_srai_epi32(u[6], bit);
+  u5 = _mm_sub_epi32(y, x);
+  u5 = _mm_add_epi32(u5, rnding);
+  u5 = _mm_srai_epi32(u5, bit);
 
-    u[7] = v[7];
-    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+  // stage 5
+  if (do_cols) {
+    addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
+    addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
+    addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
+    addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+    addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+    addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+    addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+  }
+}
 
-    // stage 6
-    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
-    v[8] = u[8];
-    v[9] = u[9];
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i u[8], x;
 
-    x = _mm_mullo_epi32(u[10], cospi32);
-    y = _mm_mullo_epi32(u[13], cospi32);
-    v[10] = _mm_sub_epi32(y, x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  // stage 0
+  // stage 1
+  // stage 2
 
-    v[13] = _mm_add_epi32(x, y);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  x = _mm_mullo_epi32(in[0], cospi60);
+  u[0] = _mm_add_epi32(x, rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
 
-    x = _mm_mullo_epi32(u[11], cospi32);
-    y = _mm_mullo_epi32(u[12], cospi32);
-    v[11] = _mm_sub_epi32(y, x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  x = _mm_mullo_epi32(in[0], cospi4);
+  u[1] = _mm_sub_epi32(kZero, x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
 
-    v[12] = _mm_add_epi32(x, y);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  // stage 3
+  // stage 4
+  __m128i temp1, temp2;
+  temp1 = _mm_mullo_epi32(u[0], cospi16);
+  x = _mm_mullo_epi32(u[1], cospi48);
+  temp1 = _mm_add_epi32(temp1, x);
+  temp1 = _mm_add_epi32(temp1, rnding);
+  temp1 = _mm_srai_epi32(temp1, bit);
+  u[4] = temp1;
+
+  temp2 = _mm_mullo_epi32(u[0], cospi48);
+  x = _mm_mullo_epi32(u[1], cospi16);
+  u[5] = _mm_sub_epi32(temp2, x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    v[14] = u[14];
-    v[15] = u[15];
+  // stage 5
+  // stage 6
+  temp1 = _mm_mullo_epi32(u[0], cospi32);
+  x = _mm_mullo_epi32(u[1], cospi32);
+  u[2] = _mm_add_epi32(temp1, x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    // stage 7
-    if (do_cols) {
-      addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col,
-                             out + 15 * 4 + col);
-      addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col,
-                             out + 14 * 4 + col);
-      addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col,
-                             out + 13 * 4 + col);
-      addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col,
-                             out + 12 * 4 + col);
-      addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col,
-                             out + 11 * 4 + col);
-      addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col,
-                             out + 10 * 4 + col);
-      addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col);
-      addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col);
-    } else {
-      addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-    }
+  u[3] = _mm_sub_epi32(temp1, x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  temp1 = _mm_mullo_epi32(u[4], cospi32);
+  x = _mm_mullo_epi32(u[5], cospi32);
+  u[6] = _mm_add_epi32(temp1, x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_sub_epi32(temp1, x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = _mm_sub_epi32(kZero, u[4]);
+    out[2] = u[6];
+    out[3] = _mm_sub_epi32(kZero, u[2]);
+    out[4] = u[3];
+    out[5] = _mm_sub_epi32(kZero, u[7]);
+    out[6] = u[5];
+    out[7] = _mm_sub_epi32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
   }
 }
 
-static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
-                              int bd, int out_shift) {
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
-  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
-  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
-  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
-  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
-  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
-  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
-  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
-  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
-  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
-  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
-  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
-  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
-  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
-  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
-  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
-  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i u[16], v[16], x, y;
-  const int col_num = 4;
-  int col;
+  __m128i u[8], v[8], x;
 
-  // Calculate the column 0, 1, 2, 3
-  for (col = 0; col < col_num; ++col) {
-    // stage 0
-    // stage 1
-    // stage 2
-    v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
-    x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
-    v[0] = _mm_add_epi32(v[0], x);
-    v[0] = _mm_add_epi32(v[0], rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+  // stage 0
+  // stage 1
+  // stage 2
 
-    v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
-    x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
-    v[1] = _mm_sub_epi32(v[1], x);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+  u[0] = _mm_mullo_epi32(in[7], cospi4);
+  x = _mm_mullo_epi32(in[0], cospi60);
+  u[0] = _mm_add_epi32(u[0], x);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
 
-    v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
-    x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
-    v[2] = _mm_add_epi32(v[2], x);
-    v[2] = _mm_add_epi32(v[2], rnding);
-    v[2] = _mm_srai_epi32(v[2], bit);
+  u[1] = _mm_mullo_epi32(in[7], cospi60);
+  x = _mm_mullo_epi32(in[0], cospi4);
+  u[1] = _mm_sub_epi32(u[1], x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
 
-    v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
-    x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
-    v[3] = _mm_sub_epi32(v[3], x);
-    v[3] = _mm_add_epi32(v[3], rnding);
-    v[3] = _mm_srai_epi32(v[3], bit);
+  // (2)
+  u[2] = _mm_mullo_epi32(in[5], cospi20);
+  x = _mm_mullo_epi32(in[2], cospi44);
+  u[2] = _mm_add_epi32(u[2], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
-    x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
+  u[3] = _mm_mullo_epi32(in[5], cospi44);
+  x = _mm_mullo_epi32(in[2], cospi20);
+  u[3] = _mm_sub_epi32(u[3], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
 
-    v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
-    x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
+  // (3)
+  u[4] = _mm_mullo_epi32(in[3], cospi36);
+  x = _mm_mullo_epi32(in[4], cospi28);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
 
-    v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
-    x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+  u[5] = _mm_mullo_epi32(in[3], cospi28);
+  x = _mm_mullo_epi32(in[4], cospi36);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
-    x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+  // (4)
+  u[6] = _mm_mullo_epi32(in[1], cospi52);
+  x = _mm_mullo_epi32(in[6], cospi12);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
-    x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
+  u[7] = _mm_mullo_epi32(in[1], cospi12);
+  x = _mm_mullo_epi32(in[6], cospi52);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
-    x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+  // stage 3
+  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 
-    v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
-    x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
 
-    v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
-    x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  u[4] = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
 
-    v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
-    x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  u[5] = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
-    x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  u[6] = _mm_mullo_epi32(v[6], cospim48);
+  x = _mm_mullo_epi32(v[7], cospi16);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
-    x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  u[7] = _mm_mullo_epi32(v[6], cospi16);
+  x = _mm_mullo_epi32(v[7], cospim48);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
-    x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  // stage 5
+  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
 
-    // stage 3
-    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
 
-    // stage 4
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
+  v[0] = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  u[2] = _mm_add_epi32(v[0], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    v[8] = _mm_mullo_epi32(u[8], cospi8);
-    x = _mm_mullo_epi32(u[9], cospi56);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
+  u[3] = _mm_sub_epi32(v[0], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
 
-    v[9] = _mm_mullo_epi32(u[8], cospi56);
-    x = _mm_mullo_epi32(u[9], cospi8);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+  v[0] = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  u[6] = _mm_add_epi32(v[0], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    v[10] = _mm_mullo_epi32(u[10], cospi40);
-    x = _mm_mullo_epi32(u[11], cospi24);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  u[7] = _mm_sub_epi32(v[0], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    v[11] = _mm_mullo_epi32(u[10], cospi24);
-    x = _mm_mullo_epi32(u[11], cospi40);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = _mm_sub_epi32(kZero, u[4]);
+    out[2] = u[6];
+    out[3] = _mm_sub_epi32(kZero, u[2]);
+    out[4] = u[3];
+    out[5] = _mm_sub_epi32(kZero, u[7]);
+    out[6] = u[5];
+    out[7] = _mm_sub_epi32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 
-    v[12] = _mm_mullo_epi32(u[12], cospim56);
-    x = _mm_mullo_epi32(u[13], cospi8);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+  }
+}
 
-    v[13] = _mm_mullo_epi32(u[12], cospi8);
-    x = _mm_mullo_epi32(u[13], cospim56);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 
-    v[14] = _mm_mullo_epi32(u[14], cospim24);
-    x = _mm_mullo_epi32(u[15], cospi40);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    in[0] = _mm_mullo_epi32(in[0], cospi32);
+    in[0] = _mm_add_epi32(in[0], rnding);
+    in[0] = _mm_srai_epi32(in[0], bit);
 
-    v[15] = _mm_mullo_epi32(u[14], cospi40);
-    x = _mm_mullo_epi32(u[15], cospim24);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+    // stage 5
+    // stage 6
+    // stage 7
+    if (do_cols) {
+      in[0] = _mm_max_epi32(in[0], clamp_lo);
+      in[0] = _mm_min_epi32(in[0], clamp_hi);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+      in[0] = _mm_add_epi32(in[0], offset);
+      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+      in[0] = _mm_max_epi32(in[0], clamp_lo_out);
+      in[0] = _mm_min_epi32(in[0], clamp_hi_out);
+    }
+
+    out[0] = in[0];
+    out[1] = in[0];
+    out[2] = in[0];
+    out[3] = in[0];
+    out[4] = in[0];
+    out[5] = in[0];
+    out[6] = in[0];
+    out[7] = in[0];
+    out[8] = in[0];
+    out[9] = in[0];
+    out[10] = in[0];
+    out[11] = in[0];
+    out[12] = in[0];
+    out[13] = in[0];
+    out[14] = in[0];
+    out[15] = in[0];
+  }
+}
+
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    u[0] = in[0];
+    u[2] = in[4];
+    u[4] = in[2];
+    u[6] = in[6];
+    u[8] = in[1];
+    u[10] = in[5];
+    u[12] = in[3];
+    u[14] = in[7];
+
+    // stage 2
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+    u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+    u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+    u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+    u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+    // stage 3
+    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+    u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+    u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+    addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = _mm_mullo_epi32(u[0], cospi32);
+    u[0] = _mm_add_epi32(x, rnding);
+    u[0] = _mm_srai_epi32(u[0], bit);
+    u[1] = u[0];
+
+    u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+    u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+
+    addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+    x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = x;
+    y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = y;
 
     // stage 5
-    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    x = _mm_mullo_epi32(u[5], cospi32);
+    y = _mm_mullo_epi32(u[6], cospi32);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 
     // stage 6
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
+    addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 
-    v[4] = _mm_mullo_epi32(u[4], cospi16);
-    x = _mm_mullo_epi32(u[5], cospi48);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[13], cospi32);
+    u[10] = _mm_sub_epi32(y, x);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
 
-    v[5] = _mm_mullo_epi32(u[4], cospi48);
-    x = _mm_mullo_epi32(u[5], cospi16);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
+    u[13] = _mm_add_epi32(x, y);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
 
-    v[6] = _mm_mullo_epi32(u[6], cospim48);
-    x = _mm_mullo_epi32(u[7], cospi16);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(u[12], cospi32);
+    u[11] = _mm_sub_epi32(y, x);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
 
-    v[7] = _mm_mullo_epi32(u[6], cospi16);
-    x = _mm_mullo_epi32(u[7], cospim48);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+    u[12] = _mm_add_epi32(x, y);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
+    // stage 7
+    if (do_cols) {
+      addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
+      addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
+      addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
+      addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
+      addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
+      addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
+      addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
+      addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+    }
+  }
+}
 
-    v[8] = u[8];
-    v[9] = u[9];
-    v[10] = u[10];
-    v[11] = u[11];
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i v[16], x, y, temp1, temp2;
 
-    v[12] = _mm_mullo_epi32(u[12], cospi16);
-    x = _mm_mullo_epi32(u[13], cospi48);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  // Calculate columns 0, 1, 2, 3 (this variant reads only in[0])
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    x = _mm_mullo_epi32(in[0], cospi62);
+    v[0] = _mm_add_epi32(x, rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
 
-    v[13] = _mm_mullo_epi32(u[12], cospi48);
-    x = _mm_mullo_epi32(u[13], cospi16);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+    x = _mm_mullo_epi32(in[0], cospi2);
+    v[1] = _mm_sub_epi32(zero, x);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
 
-    v[14] = _mm_mullo_epi32(u[14], cospim48);
-    x = _mm_mullo_epi32(u[15], cospi16);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+    // stage 3
+    v[8] = v[0];
+    v[9] = v[1];
 
-    v[15] = _mm_mullo_epi32(u[14], cospi16);
-    x = _mm_mullo_epi32(u[15], cospim48);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+    // stage 4
+    temp1 = _mm_mullo_epi32(v[8], cospi8);
+    x = _mm_mullo_epi32(v[9], cospi56);
+    temp1 = _mm_add_epi32(temp1, x);
+    temp1 = _mm_add_epi32(temp1, rnding);
+    temp1 = _mm_srai_epi32(temp1, bit);
+
+    temp2 = _mm_mullo_epi32(v[8], cospi56);
+    x = _mm_mullo_epi32(v[9], cospi8);
+    temp2 = _mm_sub_epi32(temp2, x);
+    temp2 = _mm_add_epi32(temp2, rnding);
+    temp2 = _mm_srai_epi32(temp2, bit);
+    v[8] = temp1;
+    v[9] = temp2;
+
+    // stage 5
+    v[4] = v[0];
+    v[5] = v[1];
+    v[12] = v[8];
+    v[13] = v[9];
+
+    // stage 6
+    temp1 = _mm_mullo_epi32(v[4], cospi16);
+    x = _mm_mullo_epi32(v[5], cospi48);
+    temp1 = _mm_add_epi32(temp1, x);
+    temp1 = _mm_add_epi32(temp1, rnding);
+    temp1 = _mm_srai_epi32(temp1, bit);
+
+    temp2 = _mm_mullo_epi32(v[4], cospi48);
+    x = _mm_mullo_epi32(v[5], cospi16);
+    temp2 = _mm_sub_epi32(temp2, x);
+    temp2 = _mm_add_epi32(temp2, rnding);
+    temp2 = _mm_srai_epi32(temp2, bit);
+    v[4] = temp1;
+    v[5] = temp2;
+
+    temp1 = _mm_mullo_epi32(v[12], cospi16);
+    x = _mm_mullo_epi32(v[13], cospi48);
+    temp1 = _mm_add_epi32(temp1, x);
+    temp1 = _mm_add_epi32(temp1, rnding);
+    temp1 = _mm_srai_epi32(temp1, bit);
+
+    temp2 = _mm_mullo_epi32(v[12], cospi48);
+    x = _mm_mullo_epi32(v[13], cospi16);
+    temp2 = _mm_sub_epi32(temp2, x);
+    temp2 = _mm_add_epi32(temp2, rnding);
+    temp2 = _mm_srai_epi32(temp2, bit);
+    v[12] = temp1;
+    v[13] = temp2;
 
     // stage 7
-    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+    v[2] = v[0];
+    v[3] = v[1];
+    v[6] = v[4];
+    v[7] = v[5];
+    v[10] = v[8];
+    v[11] = v[9];
+    v[14] = v[12];
+    v[15] = v[13];
 
     // stage 8
-    v[0] = u[0];
-    v[1] = u[1];
-
-    y = _mm_mullo_epi32(u[2], cospi32);
-    x = _mm_mullo_epi32(u[3], cospi32);
+    y = _mm_mullo_epi32(v[2], cospi32);
+    x = _mm_mullo_epi32(v[3], cospi32);
     v[2] = _mm_add_epi32(y, x);
     v[2] = _mm_add_epi32(v[2], rnding);
     v[2] = _mm_srai_epi32(v[2], bit);
@@ -1534,11 +2000,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[3] = _mm_add_epi32(v[3], rnding);
     v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[4] = u[4];
-    v[5] = u[5];
-
-    y = _mm_mullo_epi32(u[6], cospi32);
-    x = _mm_mullo_epi32(u[7], cospi32);
+    y = _mm_mullo_epi32(v[6], cospi32);
+    x = _mm_mullo_epi32(v[7], cospi32);
     v[6] = _mm_add_epi32(y, x);
     v[6] = _mm_add_epi32(v[6], rnding);
     v[6] = _mm_srai_epi32(v[6], bit);
@@ -1547,11 +2010,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[7] = _mm_add_epi32(v[7], rnding);
     v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
-
-    y = _mm_mullo_epi32(u[10], cospi32);
-    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(v[10], cospi32);
+    x = _mm_mullo_epi32(v[11], cospi32);
     v[10] = _mm_add_epi32(y, x);
     v[10] = _mm_add_epi32(v[10], rnding);
     v[10] = _mm_srai_epi32(v[10], bit);
@@ -1560,11 +2020,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[11] = _mm_add_epi32(v[11], rnding);
     v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = u[12];
-    v[13] = u[13];
-
-    y = _mm_mullo_epi32(u[14], cospi32);
-    x = _mm_mullo_epi32(u[15], cospi32);
+    y = _mm_mullo_epi32(v[14], cospi32);
+    x = _mm_mullo_epi32(v[15], cospi32);
     v[14] = _mm_add_epi32(y, x);
     v[14] = _mm_add_epi32(v[14], rnding);
     v[14] = _mm_srai_epi32(v[14], bit);
@@ -1575,439 +2032,1904 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
 
     // stage 9
     if (do_cols) {
-      out[0 * col_num + col] = v[0];
-      out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
-      out[2 * col_num + col] = v[12];
-      out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
-      out[4 * col_num + col] = v[6];
-      out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
-      out[6 * col_num + col] = v[10];
-      out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
-      out[8 * col_num + col] = v[3];
-      out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
-      out[10 * col_num + col] = v[15];
-      out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
-      out[12 * col_num + col] = v[5];
-      out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
-      out[14 * col_num + col] = v[9];
-      out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+      out[0] = v[0];
+      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+      out[2] = v[12];
+      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+      out[4] = v[6];
+      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+      out[6] = v[10];
+      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+      out[8] = v[3];
+      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+      out[10] = v[15];
+      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+      out[12] = v[5];
+      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+      out[14] = v[9];
+      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
     } else {
-      neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col,
-                       out + 1 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col,
-                       out + 3 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col,
-                       out + 5 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col,
-                       out + 7 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col,
-                       out + 9 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col,
-                       out + 11 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col,
-                       out + 13 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col,
-                       out + 15 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
     }
   }
 }
 
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
-  __m128i in[64], out[64];
-  const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
-  const int txw_idx = get_txw_idx(TX_16X16);
-  const int txh_idx = get_txh_idx(TX_16X16);
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], x, y;
 
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case DCT_ADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case ADST_DCT:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case ADST_ADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case FLIPADST_DCT:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
-      break;
-    default: assert(0);
-  }
-}
+  // Calculate columns 0, 1, 2, 3 (stage 2 reads only in[0]..in[7])
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    __m128i zero = _mm_setzero_si128();
+    x = _mm_mullo_epi32(in[0], cospi62);
+    u[0] = _mm_add_epi32(x, rnding);
+    u[0] = _mm_srai_epi32(u[0], bit);
+
+    x = _mm_mullo_epi32(in[0], cospi2);
+    u[1] = _mm_sub_epi32(zero, x);
+    u[1] = _mm_add_epi32(u[1], rnding);
+    u[1] = _mm_srai_epi32(u[1], bit);
+
+    x = _mm_mullo_epi32(in[2], cospi54);
+    u[2] = _mm_add_epi32(x, rnding);
+    u[2] = _mm_srai_epi32(u[2], bit);
+
+    x = _mm_mullo_epi32(in[2], cospi10);
+    u[3] = _mm_sub_epi32(zero, x);
+    u[3] = _mm_add_epi32(u[3], rnding);
+    u[3] = _mm_srai_epi32(u[3], bit);
+
+    x = _mm_mullo_epi32(in[4], cospi46);
+    u[4] = _mm_add_epi32(x, rnding);
+    u[4] = _mm_srai_epi32(u[4], bit);
+
+    x = _mm_mullo_epi32(in[4], cospi18);
+    u[5] = _mm_sub_epi32(zero, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
 
-static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
-  int i, j;
+    x = _mm_mullo_epi32(in[6], cospi38);
+    u[6] = _mm_add_epi32(x, rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
 
-  __m128i zero = _mm_setzero_si128();
+    x = _mm_mullo_epi32(in[6], cospi26);
+    u[7] = _mm_sub_epi32(zero, x);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
 
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 8; ++j) {
-      in[16 * i + j] =
-          _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
-      in[16 * i + j + 8] = zero;
-    }
-  }
+    u[8] = _mm_mullo_epi32(in[7], cospi34);
+    u[8] = _mm_add_epi32(u[8], rnding);
+    u[8] = _mm_srai_epi32(u[8], bit);
 
-  for (i = 0; i < 512; ++i) in[512 + i] = zero;
-}
+    u[9] = _mm_mullo_epi32(in[7], cospi30);
+    u[9] = _mm_add_epi32(u[9], rnding);
+    u[9] = _mm_srai_epi32(u[9], bit);
 
-static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) {
-  int i, j;
-  for (i = 0; i < (do_cols ? 16 : 8); ++i) {
-    for (j = 0; j < 8; ++j) {
-      TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
-                    in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
-                    out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
-                    out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
-    }
-  }
-}
+    u[10] = _mm_mullo_epi32(in[5], cospi42);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
 
-static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
-                                          int col) {
-  int i;
-  for (i = 0; i < 16 * 16 / 4; i += 4) {
-    in16x16[i] = in[col];
-    in16x16[i + 1] = in[col + 1];
-    in16x16[i + 2] = in[col + 2];
-    in16x16[i + 3] = in[col + 3];
-    col += 8;
-  }
-}
+    u[11] = _mm_mullo_epi32(in[5], cospi22);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
 
-static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m128i in16x16[16 * 16 / 4];
-  uint16_t *leftUp = &output[0];
-  uint16_t *rightUp = &output[16];
-  uint16_t *leftDown = &output[16 * stride];
-  uint16_t *rightDown = &output[16 * stride + 16];
+    u[12] = _mm_mullo_epi32(in[3], cospi50);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
 
-  if (fliplr) {
-    swap_addr(&leftUp, &rightUp);
-    swap_addr(&leftDown, &rightDown);
-  }
+    u[13] = _mm_mullo_epi32(in[3], cospi14);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
 
-  if (flipud) {
-    swap_addr(&leftUp, &leftDown);
-    swap_addr(&rightUp, &rightDown);
-  }
+    u[14] = _mm_mullo_epi32(in[1], cospi58);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
 
-  // Left-up quarter
-  assign_16x16_input_from_32x32(in, in16x16, 0);
-  write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd);
+    u[15] = _mm_mullo_epi32(in[1], cospi6);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
 
-  // Right-up quarter
-  assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
-  write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd);
+    // stage 3
+    addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 
-  // Left-down quarter
-  assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
-  write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd);
+    // stage 4
+    y = _mm_mullo_epi32(u[8], cospi56);
+    x = _mm_mullo_epi32(u[9], cospi56);
+    u[8] = _mm_mullo_epi32(u[8], cospi8);
+    u[8] = _mm_add_epi32(u[8], x);
+    u[8] = _mm_add_epi32(u[8], rnding);
+    u[8] = _mm_srai_epi32(u[8], bit);
 
-  // Right-down quarter
-  assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
-  write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd);
-}
+    x = _mm_mullo_epi32(u[9], cospi8);
+    u[9] = _mm_sub_epi32(y, x);
+    u[9] = _mm_add_epi32(u[9], rnding);
+    u[9] = _mm_srai_epi32(u[9], bit);
 
-static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
-                                          int col) {
-  int i;
-  for (i = 0; i < 32 * 32 / 4; i += 8) {
-    in32x32[i] = in[col];
-    in32x32[i + 1] = in[col + 1];
-    in32x32[i + 2] = in[col + 2];
-    in32x32[i + 3] = in[col + 3];
-    in32x32[i + 4] = in[col + 4];
-    in32x32[i + 5] = in[col + 5];
-    in32x32[i + 6] = in[col + 6];
-    in32x32[i + 7] = in[col + 7];
-    col += 16;
-  }
-}
+    x = _mm_mullo_epi32(u[11], cospi24);
+    y = _mm_mullo_epi32(u[10], cospi24);
+    u[10] = _mm_mullo_epi32(u[10], cospi40);
+    u[10] = _mm_add_epi32(u[10], x);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
 
-static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m128i in32x32[32 * 32 / 4];
-  uint16_t *leftUp = &output[0];
-  uint16_t *rightUp = &output[32];
-  uint16_t *leftDown = &output[32 * stride];
-  uint16_t *rightDown = &output[32 * stride + 32];
+    x = _mm_mullo_epi32(u[11], cospi40);
+    u[11] = _mm_sub_epi32(y, x);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
 
-  if (fliplr) {
-    swap_addr(&leftUp, &rightUp);
-    swap_addr(&leftDown, &rightDown);
-  }
+    x = _mm_mullo_epi32(u[13], cospi8);
+    y = _mm_mullo_epi32(u[12], cospi8);
+    u[12] = _mm_mullo_epi32(u[12], cospim56);
+    u[12] = _mm_add_epi32(u[12], x);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
 
-  if (flipud) {
-    swap_addr(&leftUp, &leftDown);
-    swap_addr(&rightUp, &rightDown);
-  }
+    x = _mm_mullo_epi32(u[13], cospim56);
+    u[13] = _mm_sub_epi32(y, x);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
+
+    x = _mm_mullo_epi32(u[15], cospi40);
+    y = _mm_mullo_epi32(u[14], cospi40);
+    u[14] = _mm_mullo_epi32(u[14], cospim24);
+    u[14] = _mm_add_epi32(u[14], x);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
+
+    x = _mm_mullo_epi32(u[15], cospim24);
+    u[15] = _mm_sub_epi32(y, x);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
+
+    // stage 5
+    addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    x = _mm_mullo_epi32(u[5], cospi48);
+    y = _mm_mullo_epi32(u[4], cospi48);
+    u[4] = _mm_mullo_epi32(u[4], cospi16);
+    u[4] = _mm_add_epi32(u[4], x);
+    u[4] = _mm_add_epi32(u[4], rnding);
+    u[4] = _mm_srai_epi32(u[4], bit);
+
+    x = _mm_mullo_epi32(u[5], cospi16);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    x = _mm_mullo_epi32(u[7], cospi16);
+    y = _mm_mullo_epi32(u[6], cospi16);
+    u[6] = _mm_mullo_epi32(u[6], cospim48);
+    u[6] = _mm_add_epi32(u[6], x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    x = _mm_mullo_epi32(u[7], cospim48);
+    u[7] = _mm_sub_epi32(y, x);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
+
+    x = _mm_mullo_epi32(u[13], cospi48);
+    y = _mm_mullo_epi32(u[12], cospi48);
+    u[12] = _mm_mullo_epi32(u[12], cospi16);
+    u[12] = _mm_add_epi32(u[12], x);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
+
+    x = _mm_mullo_epi32(u[13], cospi16);
+    u[13] = _mm_sub_epi32(y, x);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
+
+    x = _mm_mullo_epi32(u[15], cospi16);
+    y = _mm_mullo_epi32(u[14], cospi16);
+    u[14] = _mm_mullo_epi32(u[14], cospim48);
+    u[14] = _mm_add_epi32(u[14], x);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
+
+    x = _mm_mullo_epi32(u[15], cospim48);
+    u[15] = _mm_sub_epi32(y, x);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
+
+    // stage 7
+    addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 8
+    y = _mm_mullo_epi32(u[2], cospi32);
+    x = _mm_mullo_epi32(u[3], cospi32);
+    u[2] = _mm_add_epi32(y, x);
+    u[2] = _mm_add_epi32(u[2], rnding);
+    u[2] = _mm_srai_epi32(u[2], bit);
+
+    u[3] = _mm_sub_epi32(y, x);
+    u[3] = _mm_add_epi32(u[3], rnding);
+    u[3] = _mm_srai_epi32(u[3], bit);
+    y = _mm_mullo_epi32(u[6], cospi32);
+    x = _mm_mullo_epi32(u[7], cospi32);
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
 
-  // Left-up quarter
-  assign_32x32_input_from_64x64(in, in32x32, 0);
-  write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd);
+    u[7] = _mm_sub_epi32(y, x);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
 
-  // Right-up quarter
-  assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
-  write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd);
+    y = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    u[10] = _mm_add_epi32(y, x);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
+
+    u[11] = _mm_sub_epi32(y, x);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
+
+    y = _mm_mullo_epi32(u[14], cospi32);
+    x = _mm_mullo_epi32(u[15], cospi32);
+    u[14] = _mm_add_epi32(y, x);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
 
-  // Left-down quarter
-  assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
-  write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd);
+    u[15] = _mm_sub_epi32(y, x);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
 
-  // Right-down quarter
-  assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
-  write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd);
+    // stage 9
+    if (do_cols) {
+      out[0] = u[0];
+      out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
+      out[2] = u[12];
+      out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
+      out[4] = u[6];
+      out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
+      out[6] = u[10];
+      out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
+      out[8] = u[3];
+      out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
+      out[10] = u[15];
+      out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
+      out[12] = u[5];
+      out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
+      out[14] = u[9];
+      out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+    }
+  }
 }
 
-static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
-  int i, j;
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  int col;
-
-  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
-  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
-  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
-  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
-  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
-  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
-  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
-  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
-  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
-  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
-  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
-  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
-  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
-  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
-  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
-  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
-  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
-  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
-  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
-  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
-  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
-  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
-  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
-  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
-  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
-  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
-  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
-  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
-  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
-  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
-  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
-  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
-  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
-  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
-  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
-  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
-
-  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
-  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
-  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
-  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
-  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
-  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
-  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
-  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
-  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
-  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
-  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
-  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
-  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
-  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
-  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
-  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
-  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
-  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
-  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
-  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
-  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
-  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
-  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
-  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
-
-  for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
-    __m128i u[64], v[64];
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // rounding offset added before each >> bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // signed bit width of the clamp range handed to addsub_sse4_1
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], v[16], x, y;  // u/v: alternating stage buffers; x/y: scratch products
 
+  {  // 16-point butterfly network from in[0..15] to out[0..15]; each __m128i holds four 32-bit lanes
+    // stage 0: no code is emitted for this stage
     // stage 1
-    u[32] = in[1 * 16 + col];
-    u[34] = in[17 * 16 + col];
-    u[36] = in[9 * 16 + col];
-    u[38] = in[25 * 16 + col];
-    u[40] = in[5 * 16 + col];
-    u[42] = in[21 * 16 + col];
-    u[44] = in[13 * 16 + col];
-    u[46] = in[29 * 16 + col];
-    u[48] = in[3 * 16 + col];
-    u[50] = in[19 * 16 + col];
-    u[52] = in[11 * 16 + col];
-    u[54] = in[27 * 16 + col];
-    u[56] = in[7 * 16 + col];
-    u[58] = in[23 * 16 + col];
-    u[60] = in[15 * 16 + col];
-    u[62] = in[31 * 16 + col];
-
-    v[16] = in[2 * 16 + col];
-    v[18] = in[18 * 16 + col];
-    v[20] = in[10 * 16 + col];
-    v[22] = in[26 * 16 + col];
-    v[24] = in[6 * 16 + col];
-    v[26] = in[22 * 16 + col];
-    v[28] = in[14 * 16 + col];
-    v[30] = in[30 * 16 + col];
-
-    u[8] = in[4 * 16 + col];
-    u[10] = in[20 * 16 + col];
-    u[12] = in[12 * 16 + col];
-    u[14] = in[28 * 16 + col];
-
-    v[4] = in[8 * 16 + col];
-    v[6] = in[24 * 16 + col];
-
-    u[0] = in[0 * 16 + col];
-    u[2] = in[16 * 16 + col];
+    u[0] = in[0];  // inputs are gathered in bit-reversed index order (0, 8, 4, 12, ...)
+    u[1] = in[8];
+    u[2] = in[4];
+    u[3] = in[12];
+    u[4] = in[2];
+    u[5] = in[10];
+    u[6] = in[6];
+    u[7] = in[14];
+    u[8] = in[1];
+    u[9] = in[9];
+    u[10] = in[5];
+    u[11] = in[13];
+    u[12] = in[3];
+    u[13] = in[11];
+    u[14] = in[7];
+    u[15] = in[15];
 
     // stage 2
-    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
-    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
-    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
-    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
-    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
-    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
-    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
-    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
-    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
-    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
-    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
-    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
-    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
-    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
-    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
-    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
-    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
-    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
-    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
-    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
-    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
-    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
-    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
-    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
-    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
-    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
-    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
-    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
-    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
-    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
-    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
-    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    v[0] = u[0];  // u[0..7] are copied unchanged; only u[8..15] go through half_btf_sse4_1
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
 
-    // stage 3
-    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
-    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
-    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+    // stage 3: v[0..3] copied, half_btf_sse4_1 on v[4..7], addsub_sse4_1 on adjacent pairs of v[8..15]
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4: cospi32-scaled sum/difference of u[0], u[1]; half_btf_sse4_1 on u[2]/u[3], u[9]/u[14], u[10]/u[13]; addsub_sse4_1 on u[4..7]
+    x = _mm_mullo_epi32(u[0], cospi32);
+    y = _mm_mullo_epi32(u[1], cospi32);
+    v[0] = _mm_add_epi32(x, y);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_sub_epi32(x, y);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    v[15] = u[15];
+
+    // stage 5: addsub_sse4_1 on v[0..3] and v[8..15]; cospi32-scaled difference/sum of v[6], v[5]
+    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+    u[4] = v[4];
+
+    x = _mm_mullo_epi32(v[5], cospi32);
+    y = _mm_mullo_epi32(v[6], cospi32);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6: addsub_sse4_1 of u[i] with u[7 - i]; cospi32-scaled difference/sum on u[10]/u[13] and u[11]/u[12]
+    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[13], cospi32);
+    v[10] = _mm_sub_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_add_epi32(x, y);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(u[12], cospi32);
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_add_epi32(x, y);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7: out[i] and out[15 - i] come from v[i] and v[15 - i]; only the !do_cols path passes out_shift and output clamp bounds
+    if (do_cols) {
+      addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
+      addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
+      addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
+      addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
+      addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
+      addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
+      addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
+      addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], v[16], x, y;
+
+  // Calculate the column 0, 1, 2, 3
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    v[0] = _mm_mullo_epi32(in[15], cospi2);
+    x = _mm_mullo_epi32(in[0], cospi62);
+    v[0] = _mm_add_epi32(v[0], x);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_mullo_epi32(in[15], cospi62);
+    x = _mm_mullo_epi32(in[0], cospi2);
+    v[1] = _mm_sub_epi32(v[1], x);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = _mm_mullo_epi32(in[13], cospi10);
+    x = _mm_mullo_epi32(in[2], cospi54);
+    v[2] = _mm_add_epi32(v[2], x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_mullo_epi32(in[13], cospi54);
+    x = _mm_mullo_epi32(in[2], cospi10);
+    v[3] = _mm_sub_epi32(v[3], x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = _mm_mullo_epi32(in[11], cospi18);
+    x = _mm_mullo_epi32(in[4], cospi46);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(in[11], cospi46);
+    x = _mm_mullo_epi32(in[4], cospi18);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(in[9], cospi26);
+    x = _mm_mullo_epi32(in[6], cospi38);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(in[9], cospi38);
+    x = _mm_mullo_epi32(in[6], cospi26);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = _mm_mullo_epi32(in[7], cospi34);
+    x = _mm_mullo_epi32(in[8], cospi30);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(in[7], cospi30);
+    x = _mm_mullo_epi32(in[8], cospi34);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(in[5], cospi42);
+    x = _mm_mullo_epi32(in[10], cospi22);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(in[5], cospi22);
+    x = _mm_mullo_epi32(in[10], cospi42);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(in[3], cospi50);
+    x = _mm_mullo_epi32(in[12], cospi14);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(in[3], cospi14);
+    x = _mm_mullo_epi32(in[12], cospi50);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(in[1], cospi58);
+    x = _mm_mullo_epi32(in[14], cospi6);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(in[1], cospi6);
+    x = _mm_mullo_epi32(in[14], cospi58);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 3
+    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm_mullo_epi32(u[8], cospi8);
+    x = _mm_mullo_epi32(u[9], cospi56);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(u[8], cospi56);
+    x = _mm_mullo_epi32(u[9], cospi8);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(u[10], cospi40);
+    x = _mm_mullo_epi32(u[11], cospi24);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(u[10], cospi24);
+    x = _mm_mullo_epi32(u[11], cospi40);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(u[12], cospim56);
+    x = _mm_mullo_epi32(u[13], cospi8);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi8);
+    x = _mm_mullo_epi32(u[13], cospim56);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim24);
+    x = _mm_mullo_epi32(u[15], cospi40);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi40);
+    x = _mm_mullo_epi32(u[15], cospim24);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 5
+    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+
+    v[4] = _mm_mullo_epi32(u[4], cospi16);
+    x = _mm_mullo_epi32(u[5], cospi48);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(u[4], cospi48);
+    x = _mm_mullo_epi32(u[5], cospi16);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(u[6], cospim48);
+    x = _mm_mullo_epi32(u[7], cospi16);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(u[6], cospi16);
+    x = _mm_mullo_epi32(u[7], cospim48);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+
+    v[12] = _mm_mullo_epi32(u[12], cospi16);
+    x = _mm_mullo_epi32(u[13], cospi48);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi48);
+    x = _mm_mullo_epi32(u[13], cospi16);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim48);
+    x = _mm_mullo_epi32(u[15], cospi16);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi16);
+    x = _mm_mullo_epi32(u[15], cospim48);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 7
+    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 8
+    v[0] = u[0];
+    v[1] = u[1];
+
+    y = _mm_mullo_epi32(u[2], cospi32);
+    x = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(y, x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_sub_epi32(y, x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    y = _mm_mullo_epi32(u[6], cospi32);
+    x = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(y, x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_sub_epi32(y, x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    y = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    y = _mm_mullo_epi32(u[14], cospi32);
+    x = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(y, x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_sub_epi32(y, x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 9
+    if (do_cols) {
+      out[0] = v[0];
+      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+      out[2] = v[12];
+      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+      out[4] = v[6];
+      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+      out[6] = v[10];
+      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+      out[8] = v[3];
+      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+      out[10] = v[15];
+      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+      out[12] = v[5];
+      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+      out[14] = v[9];
+      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+// Stage 8 helper shared by the idct64 variants; updates u[] in place.
+// Rewrites u[10..13] and u[36..43] / u[52..59] through half_btf_sse4_1 and
+// u[16..31] through addsub_sse4_1; every other entry is left as it was.
+// The cospi*/cospim* arguments are the weights handed to half_btf_sse4_1,
+// clamp_lo/clamp_hi are forwarded to addsub_sse4_1, and rnding/bit are
+// forwarded to half_btf_sse4_1.
+static INLINE void idct64_stage8_sse4_1(
+    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+    const __m128i *rnding, int bit) {
+  // Pairs (10,13) and (11,12). Both outputs need both original inputs, so the
+  // low result is parked until the high one has been computed.
+  for (int k = 0; k < 2; ++k) {
+    __m128i *const lo = &u[10 + k];
+    __m128i *const hi = &u[13 - k];
+    const __m128i new_lo =
+        half_btf_sse4_1(cospim32, lo, cospi32, hi, rnding, bit);
+    *hi = half_btf_sse4_1(cospi32, lo, cospi32, hi, rnding, bit);
+    *lo = new_lo;
+  }
+
+  // Pairs (16,23) (17,22) (18,21) (19,20) and (31,24) (30,25) (29,26) (28,27).
+  for (int k = 0; k < 4; ++k) {
+    const int a = 16 + k;
+    const int b = 23 - k;
+    const int c = 31 - k;
+    const int d = 24 + k;
+    addsub_sse4_1(u[a], u[b], &u[a], &u[b], clamp_lo, clamp_hi);
+    addsub_sse4_1(u[c], u[d], &u[c], &u[d], clamp_lo, clamp_hi);
+  }
+
+  // Pairs (36,59) (37,58) (38,57) (39,56).
+  for (int k = 0; k < 4; ++k) {
+    __m128i *const lo = &u[36 + k];
+    __m128i *const hi = &u[59 - k];
+    const __m128i new_lo =
+        half_btf_sse4_1(cospim16, lo, cospi48, hi, rnding, bit);
+    *hi = half_btf_sse4_1(cospi48, lo, cospi16, hi, rnding, bit);
+    *lo = new_lo;
+  }
+
+  // Pairs (40,55) (41,54) (42,53) (43,52).
+  for (int k = 0; k < 4; ++k) {
+    __m128i *const lo = &u[40 + k];
+    __m128i *const hi = &u[55 - k];
+    const __m128i new_lo =
+        half_btf_sse4_1(cospim48, lo, cospim16, hi, rnding, bit);
+    *hi = half_btf_sse4_1(cospim16, lo, cospi48, hi, rnding, bit);
+    *lo = new_lo;
+  }
+}
+
+// Stage 9 helper shared by the idct64 variants; updates u[] in place.
+// u[0..15], u[32..47] and u[48..63] go through mirrored addsub_sse4_1 pairs,
+// and u[20..27] through half_btf_sse4_1 with the +/-cospi32 weights.
+// u[16..19] and u[28..31] are not touched here.
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rnding, int bit) {
+  int lo, hi;
+
+  // (0,15) (1,14) ... (7,8)
+  for (lo = 0, hi = 15; lo < hi; ++lo, --hi) {
+    addsub_sse4_1(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // (20,27) (21,26) (22,25) (23,24): the low output is held back because the
+  // high output still needs the original low input.
+  for (lo = 20, hi = 27; lo < hi; ++lo, --hi) {
+    const __m128i new_lo =
+        half_btf_sse4_1(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // (32,47) (33,46) ... (39,40)
+  for (lo = 32, hi = 47; lo < hi; ++lo, --hi) {
+    addsub_sse4_1(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // (63,48) (62,49) ... (56,55): here the higher index is the first operand.
+  for (lo = 48, hi = 63; lo < hi; ++lo, --hi) {
+    addsub_sse4_1(u[hi], u[lo], &u[hi], &u[lo], clamp_lo, clamp_hi);
+  }
+}
+
+// Stage 10 helper shared by the idct64 variants; updates u[] in place.
+// u[0..31] go through mirrored addsub_sse4_1 pairs (i, 31 - i), and u[40..55]
+// through half_btf_sse4_1 with the +/-cospi32 weights on pairs (i, 95 - i).
+// u[32..39] and u[56..63] are not touched here.
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+                                         const __m128i *cospi32,
+                                         const __m128i *clamp_lo,
+                                         const __m128i *clamp_hi,
+                                         const __m128i *rnding, int bit) {
+  for (int lo = 0, hi = 31; lo < hi; ++lo, --hi) {
+    addsub_sse4_1(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // (40,55) (41,54) ... (47,48). The pairs are disjoint, so each one can be
+  // finished before moving to the next; only the low output must wait until
+  // the high output has consumed the original low input.
+  for (int lo = 40, hi = 55; lo < hi; ++lo, --hi) {
+    const __m128i new_lo =
+        half_btf_sse4_1(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+}
+
+// Stage 11 (final) helper shared by the idct64 variants: combines u[i] with
+// u[63 - i] and writes the results to out[i] and out[63 - i].
+//   do_cols != 0: addsub_no_clamp_sse4_1 is used; bd, out_shift and log_range
+//                 are not consulted.
+//   do_cols == 0: addsub_shift_sse4_1 is used with out_shift and with bounds
+//                 derived from bd, log_range and out_shift (see below).
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+                                         int bd, int out_shift,
+                                         const int log_range) {
+  if (do_cols) {
+    for (int lo = 0, hi = 63; lo < hi; ++lo, --hi) {
+      addsub_no_clamp_sse4_1(u[lo], u[hi], &out[lo], &out[hi]);
+    }
+    return;
+  }
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  // Lower bound: the larger of -2^(log_range_out-1) and
+  // -2^(log_range-1-out_shift).
+  const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+      -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+  // Upper bound: the smaller of 2^(log_range_out-1) - 1 and
+  // 2^(log_range-1-out_shift). Note the second term carries no "- 1".
+  const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+      (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+  for (int lo = 0, hi = 63; lo < hi; ++lo, --hi) {
+    addsub_shift_sse4_1(u[lo], u[hi], &out[lo], &out[hi], &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
+
+// idct64 variant that reads only in[0]. The single value is scaled by
+// cospi[32] via half_btf_0_sse4_1, then either clamped (do_cols) or
+// round-shifted by out_shift and clamped (!do_cols), and the resulting vector
+// is stored to all 64 entries of out[].
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+  // Stages 1-6 reduce to one multiply of in[0]; stages 8-10 have no work
+  // left to do in this variant.
+  __m128i dc = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+  // Stage 11: bring the value into the output range.
+  if (do_cols) {
+    dc = _mm_max_epi32(dc, clamp_lo);
+    dc = _mm_min_epi32(dc, clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    const __m128i half = _mm_set1_epi32((1 << out_shift) >> 1);
+
+    // Rounding arithmetic right shift by out_shift, then clamp.
+    dc = _mm_sra_epi32(_mm_add_epi32(dc, half), _mm_cvtsi32_si128(out_shift));
+    dc = _mm_max_epi32(dc, clamp_lo_out);
+    dc = _mm_min_epi32(dc, clamp_hi_out);
+  }
+
+  // Every output entry receives the same vector.
+  for (int k = 0; k < 64; ++k) {
+    out[k] = dc;
+  }
+}
+
+// idct64 variant that reads only in[0..7] (see stage 1); the remaining
+// butterfly inputs are implicitly zero, which is why many early-stage
+// butterflies collapse to a single-weight half_btf_0_sse4_1 or a plain copy.
+//
+//   in        input vectors; only in[0]..in[7] are read.
+//   out       receives 64 output vectors (written by idct64_stage11_sse4_1).
+//   bit       selects the cosine table via cospi_arr(bit); also the rounding
+//             shift forwarded to the half_btf helpers.
+//   do_cols   non-zero picks log_range = max(16, bd + 6), zero picks
+//             max(16, bd + 8); also forwarded to the stage-11 helper.
+//   bd        bit depth term used in the range computation.
+//   out_shift forwarded to the stage-11 helper.
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  // Broadcast cosine weights; cospimN holds -cospi[N].
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+
+  {
+    __m128i u[64];
+
+    // stage 1
+    // Scatter the eight inputs to their butterfly slots; no other u[] entry
+    // is read before it has been written by a later stage.
+    u[0] = in[0];
+    u[8] = in[4];
+    u[16] = in[2];
+    u[24] = in[6];
+    u[32] = in[1];
+    u[40] = in[5];
+    u[48] = in[3];
+    u[56] = in[7];
+
+    // stage 2
+    // Each pair writes the mirrored slot first because the second line
+    // overwrites the shared source.
+    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+
+    // stage 3
+    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+    u[33] = u[32];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[62] = u[63];
+
+    // stage 4
+    __m128i temp1, temp2;
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+    u[17] = u[16];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[30] = u[31];
+
+    // In the two-input rotations below one result is parked in temp1/temp2
+    // so the second call still sees both original inputs.
+    temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    u[33] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    u[57] = temp2;
+
+    temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    u[41] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    u[46] = temp2;
+
+    // stage 5
+    u[9] = u[8];
+    u[14] = u[15];
+
+    temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+    u[17] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+    u[22] = temp2;
+
+    u[35] = u[32];
+    u[34] = u[33];
+    u[36] = u[39];
+    u[37] = u[38];
+    u[43] = u[40];
+    u[42] = u[41];
+    u[44] = u[47];
+    u[45] = u[46];
+    u[51] = u[48];
+    u[50] = u[49];
+    u[52] = u[55];
+    u[53] = u[54];
+    u[59] = u[56];
+    u[58] = u[57];
+    u[60] = u[63];
+    u[61] = u[62];
+
+    // stage 6
+    temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[0] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = temp2;
+    u[19] = u[16];
+    u[18] = u[17];
+    u[20] = u[23];
+    u[21] = u[22];
+    u[27] = u[24];
+    u[26] = u[25];
+    u[28] = u[31];
+    u[29] = u[30];
+
+    temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+    u[34] = temp1;
+    temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    u[35] = temp2;
+    temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    u[36] = temp1;
+    temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    u[37] = temp2;
+    temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    u[42] = temp1;
+    temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    u[43] = temp2;
+    temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    u[44] = temp1;
+    temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    u[45] = temp2;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    u[11] = u[8];
+    u[10] = u[9];
+    u[12] = u[15];
+    u[13] = u[14];
+
+    temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+    u[18] = temp1;
+    temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+    u[19] = temp2;
+    temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+    u[20] = temp1;
+    temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+    u[21] = temp2;
+    // Two groups of 16 (starting at 32 and 48); within each, j pairs with
+    // j ^ 7 and j ^ 15 pairs with j ^ 8.
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    // u[4..7] mirror u[3..0]; u[8..15] carry over unchanged into the stage-8
+    // helper.
+    u[7] = u[0];
+    u[6] = u[1];
+    u[5] = u[2];
+    u[4] = u[3];
+
+    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+    // stage 9
+    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                         bit);
+
+    // stage 10
+    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                          bit);
+
+    // stage 11
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+  {
+    __m128i u[64];
+    __m128i tmp1, tmp2, tmp3, tmp4;
+    // stage 1
+    u[0] = in[0];
+    u[32] = in[1];
+    u[36] = in[9];
+    u[40] = in[5];
+    u[44] = in[13];
+    u[48] = in[3];
+    u[52] = in[11];
+    u[56] = in[7];
+    u[60] = in[15];
+    u[16] = in[2];
+    u[20] = in[10];
+    u[24] = in[6];
+    u[28] = in[14];
+    u[4] = in[8];
+    u[8] = in[4];
+    u[12] = in[12];
+
+    // stage 2
+    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+    // stage 3
+    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+    u[33] = u[32];
+    u[34] = u[35];
+    u[37] = u[36];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[42] = u[43];
+    u[45] = u[44];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[50] = u[51];
+    u[53] = u[52];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[58] = u[59];
+    u[61] = u[60];
+    u[62] = u[63];
+
+    // stage 4
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+    u[17] = u[16];
+    u[18] = u[19];
+    u[21] = u[20];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[26] = u[27];
+    u[29] = u[28];
+    u[30] = u[31];
+
+    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    u[33] = tmp1;
+    u[34] = tmp2;
+    u[37] = tmp3;
+    u[38] = tmp4;
+
+    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    u[41] = tmp1;
+    u[42] = tmp2;
+    u[45] = tmp3;
+    u[46] = tmp4;
+
+    // stage 5
+    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+    u[9] = u[8];
+    u[10] = u[11];
+    u[13] = u[12];
+    u[14] = u[15];
+
+    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+    u[17] = tmp1;
+    u[18] = tmp2;
+    u[21] = tmp3;
+    u[22] = tmp4;
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 6
+    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[0] = tmp1;
+    u[5] = u[4];
+    u[6] = u[7];
+
+    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = tmp1;
+    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = tmp2;
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+    u[34] = tmp1;
+    u[35] = tmp2;
+    u[36] = tmp3;
+    u[37] = tmp4;
+
+    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    u[42] = tmp1;
+    u[43] = tmp2;
+    u[44] = tmp3;
+    u[45] = tmp4;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+    u[5] = tmp1;
+    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+    u[18] = tmp1;
+    u[19] = tmp2;
+    u[20] = tmp3;
+    u[21] = tmp4;
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+    // stage 9
+    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                         bit);
+
+    // stage 10
+    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                          bit);
+
+    // stage 11
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+  {
+    __m128i u[64], v[64];
+
+    // stage 1
+    u[32] = in[1];
+    u[34] = in[17];
+    u[36] = in[9];
+    u[38] = in[25];
+    u[40] = in[5];
+    u[42] = in[21];
+    u[44] = in[13];
+    u[46] = in[29];
+    u[48] = in[3];
+    u[50] = in[19];
+    u[52] = in[11];
+    u[54] = in[27];
+    u[56] = in[7];
+    u[58] = in[23];
+    u[60] = in[15];
+    u[62] = in[31];
+
+    v[16] = in[2];
+    v[18] = in[18];
+    v[20] = in[10];
+    v[22] = in[26];
+    v[24] = in[6];
+    v[26] = in[22];
+    v[28] = in[14];
+    v[30] = in[30];
+
+    u[8] = in[4];
+    u[10] = in[20];
+    u[12] = in[12];
+    u[14] = in[28];
+
+    v[4] = in[8];
+    v[6] = in[24];
+
+    u[0] = in[0];
+    u[2] = in[16];
+
+    // stage 2
+    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+    // stage 3
+    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
     u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
     u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
     u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
@@ -2039,301 +3961,1388 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
     v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
 
-    for (i = 16; i < 32; i += 4) {
-      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
-                    &clamp_hi);
+    for (i = 16; i < 32; i += 4) {
+      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+    // stage 5
+    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+    for (i = 8; i < 16; i += 4) {
+      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 16; i < 32; i += 4) {
+      u[i + 0] = v[i + 0];
+      u[i + 3] = v[i + 3];
+    }
+
+    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 6
+    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+    for (i = 8; i < 16; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 8) {
+      v[i + 0] = u[i + 0];
+      v[i + 1] = u[i + 1];
+      v[i + 6] = u[i + 6];
+      v[i + 7] = u[i + 7];
+    }
+
+    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+    // stage 7
+    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    u[4] = v[4];
+    u[7] = v[7];
+    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    for (i = 16; i < 32; i += 8) {
+      u[i + 0] = v[i + 0];
+      u[i + 1] = v[i + 1];
+      u[i + 6] = v[i + 6];
+      u[i + 7] = v[i + 7];
+    }
+
+    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[14] = u[14];
+    v[15] = u[15];
+
+    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+    for (i = 16; i < 20; ++i) {
+      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 36; ++i) {
+      v[i] = u[i];
+      v[i + 12] = u[i + 12];
+      v[i + 16] = u[i + 16];
+      v[i + 28] = u[i + 28];
+    }
+
+    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+    // stage 9
+    for (i = 0; i < 8; ++i) {
+      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 16; i < 20; ++i) {
+      u[i] = v[i];
+      u[i + 12] = v[i + 12];
+    }
+
+    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+    for (i = 32; i < 40; i++) {
+      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
     }
 
-    for (i = 32; i < 64; i += 4) {
-      v[i + 0] = u[i + 0];
-      v[i + 3] = u[i + 3];
+    for (i = 48; i < 56; i++) {
+      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
     }
 
-    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
-    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
-    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
-    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
-    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
-    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
-    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
-    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
-    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
-    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
-    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
-    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
-    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
-    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    // stage 10
+    for (i = 0; i < 16; i++) {
+      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+    }
 
-    // stage 5
-    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
-    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
-    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
-    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+    for (i = 32; i < 40; i++) v[i] = u[i];
 
-    for (i = 8; i < 16; i += 4) {
-      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
-                    &clamp_hi);
-    }
+    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
 
-    for (i = 16; i < 32; i += 4) {
-      u[i + 0] = v[i + 0];
-      u[i + 3] = v[i + 3];
+    for (i = 56; i < 64; i++) v[i] = u[i];
+
+    // stage 11
+    if (do_cols) {
+      for (i = 0; i < 32; i++) {
+        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
+      }
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      for (i = 0; i < 32; i++) {
+        addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
+                            &clamp_lo_out, &clamp_hi_out, out_shift);
+      }
     }
+  }
+}
 
-    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
-    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
-    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
-    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
-    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
-    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
-    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
-    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1;
 
-    for (i = 32; i < 64; i += 8) {
-      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
-                    &clamp_hi);
+  // stage 0
+  // stage 1
+  bf1 = in[0];
 
-      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
-                    &clamp_hi);
-    }
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5
+  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
 
-    // stage 6
-    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
-    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
-    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
-    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+  // stage 6
+  // stage 7
+  // stage 8
+  // stage 9
+  if (do_cols) {
+    bf1 = _mm_max_epi32(bf1, clamp_lo);
+    bf1 = _mm_min_epi32(bf1, clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    bf1 = _mm_add_epi32(bf1, offset);
+    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+    bf1 = _mm_max_epi32(bf1, clamp_lo_out);
+    bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+  }
+  out[0] = bf1;
+  out[1] = bf1;
+  out[2] = bf1;
+  out[3] = bf1;
+  out[4] = bf1;
+  out[5] = bf1;
+  out[6] = bf1;
+  out[7] = bf1;
+  out[8] = bf1;
+  out[9] = bf1;
+  out[10] = bf1;
+  out[11] = bf1;
+  out[12] = bf1;
+  out[13] = bf1;
+  out[14] = bf1;
+  out[15] = bf1;
+  out[16] = bf1;
+  out[17] = bf1;
+  out[18] = bf1;
+  out[19] = bf1;
+  out[20] = bf1;
+  out[21] = bf1;
+  out[22] = bf1;
+  out[23] = bf1;
+  out[24] = bf1;
+  out[25] = bf1;
+  out[26] = bf1;
+  out[27] = bf1;
+  out[28] = bf1;
+  out[29] = bf1;
+  out[30] = bf1;
+  out[31] = bf1;
+}
 
-    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1[32];
 
-    for (i = 8; i < 16; i += 4) {
-      v[i + 0] = u[i + 0];
-      v[i + 3] = u[i + 3];
-    }
+  // stage 0
+  // stage 1
+  bf1[0] = in[0];
+  bf1[4] = in[4];
+  bf1[8] = in[2];
+  bf1[12] = in[6];
+  bf1[16] = in[1];
+  bf1[20] = in[5];
+  bf1[24] = in[3];
+  bf1[28] = in[7];
 
-    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+  // stage 2
+  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
 
-    for (i = 16; i < 32; i += 8) {
-      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
-                    &clamp_hi);
+  // stage 3
+  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+  bf1[17] = bf1[16];
+  bf1[18] = bf1[19];
+  bf1[21] = bf1[20];
+  bf1[22] = bf1[23];
+  bf1[25] = bf1[24];
+  bf1[26] = bf1[27];
+  bf1[29] = bf1[28];
+  bf1[30] = bf1[31];
+
+  // stage 4 :
+  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+  bf1[9] = bf1[8];
+  bf1[10] = bf1[11];
+  bf1[13] = bf1[12];
+  bf1[14] = bf1[15];
+
+  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
 
-      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
-                    &clamp_hi);
-    }
+  // stage 5
+  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+  bf1[1] = bf1[0];
+  bf1[5] = bf1[4];
+  bf1[6] = bf1[7];
 
-    for (i = 32; i < 64; i += 8) {
-      v[i + 0] = u[i + 0];
-      v[i + 1] = u[i + 1];
-      v[i + 6] = u[i + 6];
-      v[i + 7] = u[i + 7];
-    }
+  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
 
-    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
-    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
-    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
-    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
-    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
-    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
-    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
-    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
-    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
-    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
-    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
-    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
-    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
-    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+  // stage 6
+  bf1[3] = bf1[0];
+  bf1[2] = bf1[1];
 
-    // stage 7
-    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+  // stage 7
+  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+  // stage 8
+  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+  // stage 9
+  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1[32];
+  // Variant of idct32x32_sse4_1 that reads only in[0]..in[15]; in[16]..in[31]
+  // are never loaded (presumably known to be zero -- confirm with callers).
+  // stage 0 / stage 1: scatter the 16 inputs onto the even slots of bf1[],
+  // which then serves as the single in-place working buffer.
+  bf1[0] = in[0];
+  bf1[2] = in[8];
+  bf1[4] = in[4];
+  bf1[6] = in[12];
+  bf1[8] = in[2];
+  bf1[10] = in[10];
+  bf1[12] = in[6];
+  bf1[14] = in[14];
+  bf1[16] = in[1];
+  bf1[18] = in[9];
+  bf1[20] = in[5];
+  bf1[22] = in[13];
+  bf1[24] = in[3];
+  bf1[26] = in[11];
+  bf1[28] = in[7];
+  bf1[30] = in[15];
+  // stage 2: one-input butterflies on slots 16..31. The partner slot 47 - i
+  // is written first because the second call overwrites the shared input.
+  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+  // stage 3: same one-input pattern on slots 8..15 (partner slot 23 - i),
+  // then addsub_sse4_1 on adjacent pairs of 16..31, written back in place.
+  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+  // stage 4: one-input butterflies on slots 4..7, in-place addsub on 8..15.
+  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+  // The rest of stage 4 is delegated to idct32_stage4_sse4_1 (body not shown).
+  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
+  // stage 5: slots 0..3 from one-input butterflies (bf1[1] duplicates bf1[0]),
+  // in-place addsub on slots 4..7, then idct32_stage5_sse4_1.
+  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+  bf1[1] = bf1[0];
+  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
 
-    u[4] = v[4];
-    u[7] = v[7];
-    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
-    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
 
-    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
 
-    for (i = 16; i < 32; i += 8) {
-      u[i + 0] = v[i + 0];
-      u[i + 1] = v[i + 1];
-      u[i + 6] = v[i + 6];
-      u[i + 7] = v[i + 7];
-    }
+  // stage 6: in-place addsub on slots 0..3, then idct32_stage6_sse4_1.
+  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
 
-    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
-    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
-    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
-    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
-    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
-    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
-    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
-    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
 
-    for (i = 32; i < 64; i += 16) {
-      for (j = i; j < i + 4; j++) {
-        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
-        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
-                      &clamp_hi);
-      }
-    }
+  // stage 7: handled entirely by idct32_stage7_sse4_1.
+  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
 
-    // stage 8
-    for (i = 0; i < 4; ++i) {
-      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
-    }
+  // stage 8: handled entirely by idct32_stage8_sse4_1.
+  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
-    v[14] = u[14];
-    v[15] = u[15];
+  // stage 9: idct32_stage9_sse4_1 receives bf1 and out (final output step).
+  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
 
-    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
-    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
-    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1[32], bf0[32];
 
-    for (i = 16; i < 20; ++i) {
-      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
-      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
-                    &clamp_hi);
-    }
+  // stage 0
+  // stage 1
+  bf1[0] = in[0];
+  bf1[1] = in[16];
+  bf1[2] = in[8];
+  bf1[3] = in[24];
+  bf1[4] = in[4];
+  bf1[5] = in[20];
+  bf1[6] = in[12];
+  bf1[7] = in[28];
+  bf1[8] = in[2];
+  bf1[9] = in[18];
+  bf1[10] = in[10];
+  bf1[11] = in[26];
+  bf1[12] = in[6];
+  bf1[13] = in[22];
+  bf1[14] = in[14];
+  bf1[15] = in[30];
+  bf1[16] = in[1];
+  bf1[17] = in[17];
+  bf1[18] = in[9];
+  bf1[19] = in[25];
+  bf1[20] = in[5];
+  bf1[21] = in[21];
+  bf1[22] = in[13];
+  bf1[23] = in[29];
+  bf1[24] = in[3];
+  bf1[25] = in[19];
+  bf1[26] = in[11];
+  bf1[27] = in[27];
+  bf1[28] = in[7];
+  bf1[29] = in[23];
+  bf1[30] = in[15];
+  bf1[31] = in[31];
 
-    for (i = 32; i < 36; ++i) {
-      v[i] = u[i];
-      v[i + 12] = u[i + 12];
-      v[i + 16] = u[i + 16];
-      v[i + 28] = u[i + 28];
-    }
+  // stage 2
+  bf0[0] = bf1[0];
+  bf0[1] = bf1[1];
+  bf0[2] = bf1[2];
+  bf0[3] = bf1[3];
+  bf0[4] = bf1[4];
+  bf0[5] = bf1[5];
+  bf0[6] = bf1[6];
+  bf0[7] = bf1[7];
+  bf0[8] = bf1[8];
+  bf0[9] = bf1[9];
+  bf0[10] = bf1[10];
+  bf0[11] = bf1[11];
+  bf0[12] = bf1[12];
+  bf0[13] = bf1[13];
+  bf0[14] = bf1[14];
+  bf0[15] = bf1[15];
+  bf0[16] =
+      half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+  bf0[17] =
+      half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+  bf0[18] =
+      half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+  bf0[19] =
+      half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+  bf0[20] =
+      half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+  bf0[21] =
+      half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+  bf0[22] =
+      half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+  bf0[23] =
+      half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+  bf0[24] =
+      half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+  bf0[25] =
+      half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+  bf0[26] =
+      half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+  bf0[27] =
+      half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+  bf0[28] =
+      half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+  bf0[29] =
+      half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+  bf0[30] =
+      half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+  bf0[31] =
+      half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
 
-    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
-    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
-    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
-    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
-    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
-    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
-    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
-    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
-    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
-    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
-    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
-    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
-    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
-    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+  // stage 3
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] =
+      half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+  bf1[9] =
+      half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+  bf1[10] =
+      half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+  bf1[11] =
+      half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+  bf1[12] =
+      half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+  bf1[14] =
+      half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+  bf1[15] =
+      half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+  addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
 
-    // stage 9
-    for (i = 0; i < 8; ++i) {
-      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
-    }
+  // stage 4
+  bf0[0] = bf1[0];
+  bf0[1] = bf1[1];
+  bf0[2] = bf1[2];
+  bf0[3] = bf1[3];
+  bf0[4] =
+      half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+  bf0[5] =
+      half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+  bf0[6] =
+      half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+  bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+  addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+  bf0[16] = bf1[16];
+  bf0[17] =
+      half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+  bf0[18] =
+      half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+  bf0[19] = bf1[19];
+  bf0[20] = bf1[20];
+  bf0[21] =
+      half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+  bf0[22] =
+      half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] =
+      half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+  bf0[26] =
+      half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+  bf0[27] = bf1[27];
+  bf0[28] = bf1[28];
+  bf0[29] =
+      half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+  bf0[30] =
+      half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+  bf0[31] = bf1[31];
 
-    for (i = 16; i < 20; ++i) {
-      u[i] = v[i];
-      u[i + 12] = v[i + 12];
-    }
+  // stage 5
+  bf1[0] =
+      half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+  bf1[1] =
+      half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+  bf1[2] =
+      half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+  bf1[3] =
+      half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+  addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] =
+      half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+  bf1[10] =
+      half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] =
+      half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+  bf1[14] =
+      half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+  bf1[15] = bf0[15];
+  addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
 
-    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
-    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
-    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
-    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
-    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
-    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
-    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
-    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+  // stage 6
+  addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+  bf0[4] = bf1[4];
+  bf0[5] =
+      half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+  bf0[6] =
+      half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+  bf0[7] = bf1[7];
+  addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] =
+      half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+  bf0[19] =
+      half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+  bf0[20] =
+      half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+  bf0[21] =
+      half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+  bf0[22] = bf1[22];
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] = bf1[25];
+  bf0[26] =
+      half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+  bf0[27] =
+      half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+  bf0[28] =
+      half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+  bf0[29] =
+      half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
 
-    for (i = 32; i < 40; i++) {
-      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
-    }
+  // stage 7
+  addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] =
+      half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+  bf1[11] =
+      half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+  bf1[12] =
+      half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+  // stage 8
+  addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] = bf1[18];
+  bf0[19] = bf1[19];
+  bf0[20] =
+      half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+  bf0[21] =
+      half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+  bf0[22] =
+      half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+  bf0[23] =
+      half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+  bf0[24] =
+      half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+  bf0[25] =
+      half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+  bf0[26] =
+      half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+  bf0[27] =
+      half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+  bf0[28] = bf1[28];
+  bf0[29] = bf1[29];
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
+
+  // stage 9
+  if (do_cols) {
+    addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
+    addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
+    addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
+    addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
+    addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
+    addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
+    addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
+    addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
+    addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
+    addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
+    addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
+    addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
+    addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
+    addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
+    addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
+    addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
 
-    for (i = 48; i < 56; i++) {
-      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
-    }
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // Assembly version doesn't support some transform types, so use C version
+      // for those.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                               bd);
+      break;
+    default:
+      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, bd);
+      break;
+  }
+}
 
-    // stage 10
-    for (i = 0; i < 16; i++) {
-      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
-    }
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                         int stride,
+                                         const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // Assembly version doesn't support some transform types, so use C version
+      // for those.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                txfm_param->tx_type, txfm_param->bd);
+      break;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+  }
+}
 
-    for (i = 32; i < 40; i++) v[i] = u[i];
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                         int stride,
+                                         const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // Assembly version doesn't support some transform types, so use C version
+      // for those.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                txfm_param->tx_type, txfm_param->bd);
+      break;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+  }
+}
 
-    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
-    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
-    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
-    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
-    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
-    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
-    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
-    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
-    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
-    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
-    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
-    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
-    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
-    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
+                                          uint8_t *dest, int stride,
+                                          const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // Assembly version doesn't support some transform types, so use C version
+      // for those.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                 tx_type, bd);
+      break;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+  }
+}
 
-    for (i = 56; i < 64; i++) v[i] = u[i];
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
+                                          uint8_t *dest, int stride,
+                                          const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+      // Assembly version doesn't support IDTX, so use C version for it.
+    case IDTX:
+      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                 tx_type, bd);
+      break;
+    default: assert(0);
+  }
+}
 
-    // stage 11
-    if (do_cols) {
-      for (i = 0; i < 32; i++) {
-        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
-                               &out[16 * (63 - i) + col]);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  int eob = txfm_param->eob;
+  int bd = txfm_param->bd;
+  int lossless = txfm_param->lossless;
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    return;
+  }
+  switch (tx_type) {
+      // Assembly version doesn't support some transform types, so use C version
+      // for those.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                               bd);
+      break;
+    default:
+      av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, bd);
+      break;
+  }
+}
+
+static const transform_1d_sse4_1
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      {
+          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+            NULL },
+          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+            NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+          idct32x32_sse4_1 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+          idct64x64_sse4_1 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  __m128i buf1[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: row transform
+  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+    __m128i buf0[64];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+      __m128i *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_sse4_1(
+          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m128i *_buf1 = buf1 + i * 4;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
       }
     } else {
-      for (i = 0; i < 32; i++) {
-        addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
-                            &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi,
-                            out_shift);
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
       }
     }
   }
-}
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+                                    buf1 + i * txfm_size_row, txfm_size_row,
+                                    -shift[1]);
+  }
 
-void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
-  __m128i in[64 * 64 / 4], out[64 * 64 / 4];
-  const int8_t *shift = inv_txfm_shift_ls[TX_64X64];
-  const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
-  const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+                                     output + 8 * i, stride, ud_flip,
+                                     txfm_size_row, bd);
+    }
+  }
+}
 
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob, const int bd) {
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_64x64_lower_32x32(coeff, in);
-      transpose_64x64(in, out, 0);
-      idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_64x64(in, out, 1);
-      idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd);
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      highbd_inv_txfm2d_add_no_identity_sse41(
+          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+          bd);
       break;
+    default: assert(0); break;
+  }
+}
 
-    default:
-      av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  switch (tx_size) {
+    case TX_32X32:
+      av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X16:
+      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      break;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X16:
+      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X8:
+      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X32:
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X16:
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      break;
+    case TX_32X64:
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X32:
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      break;
+    case TX_64X64:
+    case TX_16X64:
+    case TX_64X16:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
       break;
+    default: assert(0 && "Invalid transform size"); break;
   }
 }
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 608bd88a4..e298cf653 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/convolve_sse4_1.h"
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index b29bd1d79..6f24e5948 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
-#define _HIGHBD_TXFM_UTILITY_SSE4_H
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
 
 #include <smmintrin.h> /* SSE4.1 */
 
@@ -75,6 +75,17 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
                 out[63]);
 }
 
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
+  for (int j = 0; j < 8; j++) {
+    for (int i = 0; i < 8; i++) {
+      TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
+                    input[i * 32 + j + 16], input[i * 32 + j + 24],
+                    output[j * 32 + i + 0], output[j * 32 + i + 8],
+                    output[j * 32 + i + 16], output[j * 32 + i + 24]);
+    }
+  }
+}
+
 // Note:
 //  rounding = 1 << (bit - 1)
 static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
@@ -100,4 +111,15 @@ static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
   return x;
 }
 
-#endif  // _HIGHBD_TXFM_UTILITY_SSE4_H
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+                                    int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+                                        const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob, const int bd);
+
+#endif  // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
index a08beaafd..4bcab0564 100644
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -19,10 +19,21 @@ static const uint8_t warp_highbd_arrange_bytes[16] = {
   0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
 };
 
-static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
-                                     int sx, int alpha, int k,
-                                     const int offset_bits_horiz,
-                                     const int reduce_bits_horiz) {
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
+  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                          __m128i *coeff) {
   // Filter even-index pixels
   const __m128i tmp_0 = _mm_loadu_si128(
       (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
@@ -43,27 +54,13 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
   const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
 
   // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
-  const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
   // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
-  const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
   // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
-  const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
   // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
-  const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
-                                             ((1 << reduce_bits_horiz) >> 1));
-
-  // Calculate filtered results
-  const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
-  const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
-  const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
-  const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
-  __m128i res_even =
-      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
-  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
-                           _mm_cvtsi32_si128(reduce_bits_horiz));
+  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
   // Filter odd-index pixels
   const __m128i tmp_1 = _mm_loadu_si128(
@@ -80,15 +77,63 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
   const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
   const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
 
-  const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-  const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-  const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-  const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+    int sx, __m128i *coeff) {
+  // alpha == 0: every pixel uses the same filter, so load it only once
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  coeff[0] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+  coeff[2] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+  coeff[4] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+  coeff[6] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+  coeff[1] = coeff[0];
+  coeff[3] = coeff[2];
+  coeff[5] = coeff[4];
+  coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(
+    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+  const __m128i src_1 = *src;
+  const __m128i src2_1 = *src2;
 
-  const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
-  const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
-  const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
-  const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+                                             ((1 << reduce_bits_horiz) >> 1));
+
+  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
+  const __m128i res_2 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
+  const __m128i res_4 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
+  const __m128i res_6 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
+
+  __m128i res_even =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+                           _mm_cvtsi32_si128(reduce_bits_horiz));
+
+  const __m128i res_1 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
+  const __m128i res_3 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+  const __m128i res_5 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+  const __m128i res_7 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
 
   __m128i res_odd =
       _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
@@ -101,6 +146,145 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
   tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
 }
 
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+                                       __m128i *tmp, int sx, int alpha, int k,
+                                       const int offset_bits_horiz,
+                                       const int reduce_bits_horiz) {
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+                           reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+  (void)alpha;
+  int k;
+
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;
+  int k;
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+    __m128i coeff[8];
+    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+  int k;
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void highbd_warp_horizontal_filter(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  int k;
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+                        reduce_bits_horiz);
+  }
+}
+
+static INLINE void highbd_prepare_warp_horizontal_filter(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0 && beta == 0)
+    highbd_warp_horizontal_filter_alpha0_beta0(
+        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+        offset_bits_horiz, reduce_bits_horiz);
+
+  else if (alpha == 0 && beta != 0)
+    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                         beta, p_height, height, i,
+                                         offset_bits_horiz, reduce_bits_horiz);
+
+  else if (alpha != 0 && beta == 0)
+    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                        beta, p_height, height, i,
+                                        offset_bits_horiz, reduce_bits_horiz);
+  else
+    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                  p_height, height, i, offset_bits_horiz,
+                                  reduce_bits_horiz);
+}
+
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                    int width, int height, int stride,
                                    uint16_t *pred, int p_col, int p_row,
@@ -247,27 +431,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
           const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
           const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
 
-          horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k,
-                            offset_bits_horiz, reduce_bits_horiz);
+          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+                              offset_bits_horiz, reduce_bits_horiz);
         }
       } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          const __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          const __m128i src2 =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
-          horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz,
-                            reduce_bits_horiz);
-        }
+        highbd_prepare_warp_horizontal_filter(
+            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+            offset_bits_horiz, reduce_bits_horiz);
       }
 
       // Vertical filter
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index d1ea26290..9f2e2b457 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -13,7 +13,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/convolve_sse4_1.h"
@@ -21,6 +20,21 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
+// Builds the weight vector handed to comp_avg(): fwd_offset and bck_offset
+// interleaved as 16-bit lanes, i.e. {fwd, bck, fwd, bck, ...}.
+static INLINE __m256i unpack_weights_avx2(const ConvolveParams *conv_params) {
+  const __m256i wt_fwd = _mm256_set1_epi16(conv_params->fwd_offset);
+  const __m256i wt_bck = _mm256_set1_epi16(conv_params->bck_offset);
+  // Both operands are broadcasts, so unpacklo yields the same pair everywhere.
+  return _mm256_unpacklo_epi16(wt_fwd, wt_bck);
+}
+
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+  return _mm256_permute2x128_si256(  // 0x20: low lane <- *a, high lane <- *b
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a)),
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)b)), 0x20);
+}
+
 void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                              int dst_stride0, int w, int h,
                              const InterpFilterParams *filter_params_x,
@@ -34,11 +48,7 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
   const int offset_0 =
@@ -68,13 +78,11 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   (void)subpel_y_q4;
 
   for (i = 0; i < h; i += 2) {
+    const uint8_t *src_data = src_ptr + i * src_stride;
+    CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
     for (j = 0; j < w; j += 8) {
-      const __m256i data = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
-          _mm256_castsi128_si256(_mm_loadu_si128(
-              (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
-          0x20);
+      const __m256i data =
+          load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
 
       __m256i res = convolve_lowbd_x(data, coeffs, filt);
 
@@ -86,13 +94,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
       // Accumulate values into the destination buffer
       if (do_average) {
-        const __m256i data_ref_0 = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-            _mm256_castsi128_si256(_mm_loadu_si128(
-                (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-            0x20);
-
+        const __m256i data_ref_0 =
+            load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
         const __m256i comp_avg_res =
             comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
 
@@ -141,11 +144,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const __m256i round_const =
       _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
   const int offset_0 =
@@ -172,72 +171,35 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   for (j = 0; j < w; j += 16) {
     const uint8_t *data = &src_ptr[j];
     __m256i src6;
-
     // Load lines a and b. Line a to lower 128, line b to upper 128
-    const __m256i src_01a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-        0x20);
-
-    const __m256i src_12a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-        0x20);
-
-    const __m256i src_23a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-        0x20);
-
-    const __m256i src_34a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-        0x20);
-
-    const __m256i src_45a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-        0x20);
-
-    src6 = _mm256_castsi128_si256(
-        _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-    const __m256i src_56a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-        src6, 0x20);
-
-    s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-    s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-    s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
-    s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
-    s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
-    s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+    {
+      __m256i src_ab[7];
+      __m256i src_a[7];
+      src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+      for (int kk = 0; kk < 6; ++kk) {
+        data += src_stride;
+        src_a[kk + 1] =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+        src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+      }
+      src6 = src_a[6];
+      s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+      s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+      s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+      s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+      s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+      s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+    }
 
     for (i = 0; i < h; i += 2) {
-      data = &src_ptr[i * src_stride + j];
-      const __m256i src_67a = _mm256_permute2x128_si256(
-          src6,
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-          0x20);
+      data = &src_ptr[(i + 7) * src_stride + j];
+      const __m256i src7 =
+          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+      const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
 
       src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-      const __m256i src_78a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-          src6, 0x20);
+          _mm_loadu_si128((__m128i *)(data + src_stride)));
+      const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
 
       s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
       s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
@@ -266,13 +228,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
       if (w - j < 16) {
         if (do_average) {
-          const __m256i data_ref_0 = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-              0x20);
-
+          const __m256i data_ref_0 = load_line2_avx2(
+              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
           const __m256i comp_avg_res =
               comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
 
@@ -325,19 +282,12 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
             _mm256_add_epi16(res_hi_round, offset_const_2);
 
         if (do_average) {
-          const __m256i data_ref_0_lo = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-              0x20);
-
-          const __m256i data_ref_0_hi = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))),
-              0x20);
+          const __m256i data_ref_0_lo = load_line2_avx2(
+              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+
+          const __m256i data_ref_0_hi =
+              load_line2_avx2(&dst[i * dst_stride + j + 8],
+                              &dst[i * dst_stride + j + 8 + dst_stride]);
 
           const __m256i comp_avg_res_lo =
               comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
@@ -404,11 +354,7 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
   const int offset_0 =
@@ -442,15 +388,14 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
     {
+      const uint8_t *src_h = src_ptr + j;
       for (i = 0; i < im_h; i += 2) {
-        __m256i data = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+        __m256i data =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
         if (i + 1 < im_h)
           data = _mm256_inserti128_si256(
-              data,
-              _mm_loadu_si128(
-                  (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
-              1);
+              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+        src_h += (src_stride << 1);
         __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
 
         res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
@@ -500,13 +445,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
           const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
 
           if (do_average) {
-            const __m256i data_ref_0 = _mm256_permute2x128_si256(
-                _mm256_castsi128_si256(
-                    _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-                _mm256_castsi128_si256(_mm_loadu_si128(
-                    (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-                0x20);
-
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
             const __m256i comp_avg_res =
                 comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
 
@@ -534,12 +475,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
           const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
 
           if (do_average) {
-            const __m256i data_ref_0 = _mm256_permute2x128_si256(
-                _mm256_castsi128_si256(
-                    _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-                _mm256_castsi128_si256(_mm_loadu_si128(
-                    (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-                0x20);
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
 
             const __m256i comp_avg_res =
                 comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -598,11 +536,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const __m256i zero = _mm256_setzero_si256();
 
   const int offset_0 =
@@ -663,13 +597,8 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
 
         // Accumulate values into the destination buffer
         if (do_average) {
-          const __m256i data_ref_0 = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-              0x20);
-
+          const __m256i data_ref_0 = load_line2_avx2(
+              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
           const __m256i comp_avg_res =
               comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
 
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
index ffbb31849..f645e0454 100644
--- a/third_party/aom/av1/common/x86/reconinter_avx2.c
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -16,8 +16,504 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "av1/common/blockd.h"
 
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+                                     const __m256i s1) {
+  const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));  // |s0 - s1|
+  return _mm256_abs_epi16(  // per 16-bit lane: |mask_base + (diff >> 4)|
+      _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
+  // Clamp to [0, 64] is skipped: callers pass 8-bit pixels, so diff >> 4 <= 15.
+}
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,  // out: stride == w
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          int h, int w) {
+  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+  const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);  // negative if INV
+  int i = 0;
+  if (4 == w) {  // 4 rows of 4 pixels -> 16 mask bytes per iteration
+    do {
+      const __m128i s0A = xx_loadl_32(src0);
+      const __m128i s0B = xx_loadl_32(src0 + stride0);
+      const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
+      const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+      const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+      const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+      const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+
+      const __m128i s1A = xx_loadl_32(src1);
+      const __m128i s1B = xx_loadl_32(src1 + stride1);
+      const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
+      const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+      const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+      const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+      const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+      const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+      const __m128i x_m8 =  // 0xd8: qword order 0,2,1,3 -> rows A,B,C,D in low
+          _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
+      xx_storeu_128(mask, x_m8);
+      src0 += (stride0 << 2);
+      src1 += (stride1 << 2);
+      mask += 16;
+      i += 4;
+    } while (i < h);  // assumes h is a multiple of 4 - TODO confirm
+  } else if (8 == w) {  // 4 rows of 8 pixels -> 32 mask bytes per iteration
+    do {
+      const __m128i s0A = xx_loadl_64(src0);
+      const __m128i s0B = xx_loadl_64(src0 + stride0);
+      const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
+      const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+      const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+      const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+      const __m128i s1A = xx_loadl_64(src1);
+      const __m128i s1B = xx_loadl_64(src1 + stride1);
+      const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
+      const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+      const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
+      const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
+      const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w);
+      const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w);
+      const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);  // rows A,B | C,D
+      yy_storeu_256(mask, m8);
+      src0 += stride0 << 2;
+      src1 += stride1 << 2;
+      mask += 32;
+      i += 4;
+    } while (i < h);  // assumes h is a multiple of 4 - TODO confirm
+  } else if (16 == w) {  // 2 rows of 16 pixels -> 32 mask bytes per iteration
+    do {
+      const __m128i s0A = xx_load_128(src0);  // NOTE(review): aligned load?
+      const __m128i s0B = xx_load_128(src0 + stride0);
+      const __m128i s1A = xx_load_128(src1);
+      const __m128i s1B = xx_load_128(src1 + stride1);
+      const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+      const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+      const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+      const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+      const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+      const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+      const __m256i m8 =  // packus interleaves lanes; 0xd8 restores row order
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+      yy_storeu_256(mask, m8);
+      src0 += stride0 << 1;
+      src1 += stride1 << 1;
+      mask += 32;
+      i += 2;
+    } while (i < h);  // assumes h is even - TODO confirm
+  } else {  // wider blocks: one row per outer pass, 32 pixels per inner step
+    do {
+      int j = 0;
+      do {
+        const __m256i s0 = yy_loadu_256(src0 + j);
+        const __m256i s1 = yy_loadu_256(src1 + j);
+        const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+        const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+        const __m256i s0H =
+            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+        const __m256i s1H =
+            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+        const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+        const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+        const __m256i m8 =
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+        yy_storeu_256(mask + j, m8);
+        j += 32;
+      } while (j < w);  // assumes w is a multiple of 32 - TODO confirm
+      src0 += stride0;
+      src1 += stride1;
+      mask += w;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+                                         const __m256i *data_src1,
+                                         const __m256i *round_const,
+                                         const __m256i *mask_base_16,
+                                         const __m256i *clip_diff, int round) {
+  // Unsigned |src0 - src1|: one of the two saturating differences is zero.
+  const __m256i abs_diff =
+      _mm256_max_epu16(_mm256_subs_epu16(*data_src0, *data_src1),
+                       _mm256_subs_epu16(*data_src1, *data_src0));
+  const __m256i rounded =
+      _mm256_srli_epi16(_mm256_adds_epu16(abs_diff, *round_const), round);
+  const __m256i scaled = _mm256_srli_epi16(rounded, DIFF_FACTOR_LOG2);
+  const __m256i biased = _mm256_adds_epi16(scaled, *mask_base_16);
+  return _mm256_min_epi16(biased, *clip_diff);  // clamp from above only
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+                                             const __m256i *data_src1,
+                                             const __m256i *round_const,
+                                             const __m256i *mask_base_16,
+                                             const __m256i *clip_diff,
+                                             int round) {
+  // Inverse variant of calc_mask_d16_avx2(): computes the identical clamped
+  // mask m and returns *clip_diff - m. Delegating to the forward helper keeps
+  // the two in lock-step instead of maintaining a duplicated copy of the
+  // difference / rounding / clamp sequence.
+  //
+  // The callers in this file pass AOM_BLEND_A64_MAX_ALPHA as *clip_diff, so
+  // the result is the complementary blend weight.
+  const __m256i diff_clamp = calc_mask_d16_avx2(
+      data_src0, data_src1, round_const, mask_base_16, clip_diff, round);
+  return _mm256_sub_epi16(*clip_diff, diff_clamp);
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(  // mask stride == w
+    uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+  const int mask_base = 38;
+  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);  // rounding term
+  const __m256i y38 = _mm256_set1_epi16(mask_base);
+  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  // clamp
+  int i = 0;
+  if (w == 4) {  // 4 rows of 4 values -> 16 mask bytes per iteration
+    do {
+      const __m128i s0A = xx_loadl_64(src0);
+      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+      const __m128i s1A = xx_loadl_64(src1);
+      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+                                      _mm_unpacklo_epi64(s0A, s0B));
+      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+                                      _mm_unpacklo_epi64(s1A, s1B));
+      const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+      xx_storeu_128(mask,  // 0xd8: qword order 0,2,1,3; low 128 bits stored
+                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 16;
+      i += 4;
+    } while (i < h);  // assumes h is a multiple of 4 - TODO confirm
+  } else if (w == 8) {  // 4 rows of 8 values -> 32 mask bytes per iteration
+    do {
+      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+      const __m256i s0CD =
+          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+      const __m256i s1CD =
+          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+      const __m256i m16AB =
+          calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+      const __m256i m16CD =
+          calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 32;
+      i += 4;
+    } while (i < h);  // assumes h is a multiple of 4 - TODO confirm
+  } else if (w == 16) {  // 2 rows of 16 values -> 32 mask bytes per iteration
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 1;
+      src1 += src1_stride << 1;
+      mask += 32;
+      i += 2;
+    } while (i < h);  // assumes h is even - TODO confirm
+  } else if (w == 32) {  // one row per iteration, two 16-value halves
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 32;
+      i += 1;
+    } while (i < h);
+  } else if (w == 64) {  // one row per iteration, four 16-value groups
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 64;
+      i += 1;
+    } while (i < h);
+  } else {  // reads 128 values per row, i.e. assumes w == 128 - TODO confirm
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s0E = yy_loadu_256(src0 + 64);
+      const __m256i s0F = yy_loadu_256(src0 + 80);
+      const __m256i s0G = yy_loadu_256(src0 + 96);
+      const __m256i s0H = yy_loadu_256(src0 + 112);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i s1E = yy_loadu_256(src1 + 64);
+      const __m256i s1F = yy_loadu_256(src1 + 80);
+      const __m256i s1G = yy_loadu_256(src1 + 96);
+      const __m256i s1H = yy_loadu_256(src1 + 112);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m16E =
+          calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+      const __m256i m16F =
+          calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+      const __m256i m16G =
+          calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+      const __m256i m16H =
+          calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 128;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+    uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+  const int mask_base = 38;  // broadcast below as y38
+  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);  // rounding offset
+  const __m256i y38 = _mm256_set1_epi16(mask_base);
+  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  int i = 0;  // next row to process
+  if (w == 4) {  // 4 rows per pass (mask += 16)
+    do {  // do/while: the body runs at least once, even if h <= 0
+      const __m128i s0A = xx_loadl_64(src0);
+      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+      const __m128i s1A = xx_loadl_64(src1);
+      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+                                      _mm_unpacklo_epi64(s0A, s0B));
+      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+                                      _mm_unpacklo_epi64(s1A, s1B));
+      const __m256i m16 =
+          calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+      xx_storeu_128(mask,
+                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 16;
+      i += 4;
+    } while (i < h);
+  } else if (w == 8) {  // 4 rows per pass (mask += 32)
+    do {
+      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+      const __m256i s0CD =
+          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+      const __m256i s1CD =
+          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+      const __m256i m16AB =
+          calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+      const __m256i m16CD =
+          calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);  // per-lane pack
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));  // fix ordering
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 32;
+      i += 4;
+    } while (i < h);
+  } else if (w == 16) {  // 2 rows per pass (mask += 32)
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 1;
+      src1 += src1_stride << 1;
+      mask += 32;
+      i += 2;
+    } while (i < h);
+  } else if (w == 32) {  // 1 row per pass, two loads per source
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 32;
+      i += 1;
+    } while (i < h);
+  } else if (w == 64) {  // 1 row per pass, four loads per source
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 64;
+      i += 1;
+    } while (i < h);
+  } else {  // any other w: 128 entries are handled per row
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s0E = yy_loadu_256(src0 + 64);
+      const __m256i s0F = yy_loadu_256(src0 + 80);
+      const __m256i s0G = yy_loadu_256(src0 + 96);
+      const __m256i s0H = yy_loadu_256(src0 + 112);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i s1E = yy_loadu_256(src1 + 64);
+      const __m256i s1F = yy_loadu_256(src1 + 80);
+      const __m256i s1G = yy_loadu_256(src1 + 96);
+      const __m256i s1H = yy_loadu_256(src1 + 112);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m16E =
+          calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+      const __m256i m16F =
+          calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+      const __m256i m16G =
+          calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+      const __m256i m16H =
+          calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 128;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  const int shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  // Adding the rounding constant can overflow, but that much precision is not
+  // needed here. The code should also work for other values of
+  // DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA, but corner-case bugs are
+  // possible for those, so the two values it was written for are asserted.
+  assert(DIFF_FACTOR_LOG2 == 4);
+  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+  if (mask_type == DIFFWTD_38) {  // regular mask builder
+    build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+                                         src1_stride, h, w, shift);
+  } else {  // every other mask_type uses the inverse-mask builder
+    build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+                                             src1_stride, h, w, shift);
+  }
+}
+
 void av1_build_compound_diffwtd_mask_highbd_avx2(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
     int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
index 375def62e..0aaf1f454 100644
--- a/third_party/aom/av1/common/x86/selfguided_avx2.c
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -546,17 +546,18 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
   }
 }
 
-void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
-                                     int dgd_stride, int32_t *flt0,
-                                     int32_t *flt1, int flt_stride,
-                                     int sgr_params_idx, int bit_depth,
-                                     int highbd) {
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+                                    int dgd_stride, int32_t *flt0,
+                                    int32_t *flt1, int flt_stride,
+                                    int sgr_params_idx, int bit_depth,
+                                    int highbd) {
   // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
   // Ctl and Dtl is 32-byte aligned.
   const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
 
-  DECLARE_ALIGNED(32, int32_t,
-                  buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
+  int32_t *buf = aom_memalign(
+      32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+  if (!buf) return -1;
 
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -625,6 +626,8 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
     final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                  height, highbd);
   }
+  aom_free(buf);
+  return 0;
 }
 
 void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
@@ -635,8 +638,10 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
-  av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
-                                  width, eps, bit_depth, highbd);
+  const int ret = av1_selfguided_restoration_avx2(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  (void)ret;  // keeps ret "used" when assert() is compiled out (NDEBUG)
+  assert(!ret);  // NOTE(review): a nonzero ret is unhandled in NDEBUG builds
   const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
   decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index c64150b9d..ea3f6d942 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -499,13 +499,15 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
   }
 }
 
-void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
-                                       int height, int dgd_stride,
-                                       int32_t *flt0, int32_t *flt1,
-                                       int flt_stride, int sgr_params_idx,
-                                       int bit_depth, int highbd) {
-  DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
-  memset(buf, 0, sizeof(buf));
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+                                      int height, int dgd_stride, int32_t *flt0,
+                                      int32_t *flt1, int flt_stride,
+                                      int sgr_params_idx, int bit_depth,
+                                      int highbd) {
+  int32_t *buf = (int32_t *)aom_memalign(
+      16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+  if (!buf) return -1;
+  memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
 
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -574,6 +576,8 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
     final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                  height, highbd);
   }
+  aom_free(buf);
+  return 0;
 }
 
 void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
@@ -584,8 +588,10 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
-  av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1,
-                                    width, eps, bit_depth, highbd);
+  const int ret = av1_selfguided_restoration_sse4_1(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  (void)ret;  // keeps ret "used" when assert() is compiled out (NDEBUG)
+  assert(!ret);  // NOTE(review): a nonzero ret is unhandled in NDEBUG builds
   const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
   decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
index efc542cbf..b810cea2e 100644
--- a/third_party/aom/av1/common/x86/warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -203,15 +203,72 @@ static const uint8_t even_mask[16] = { 0, 2,  2,  4,  4,  6,  6,  8,
 static const uint8_t odd_mask[16] = { 1, 3,  3,  5,  5,  7,  7,  9,
                                       9, 11, 11, 13, 13, 15, 15, 0 };
 
-static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
-                                     int alpha, int k,
+static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
+                                                   0, 1, 0, 1, 0, 1, 0, 1 };
+// pshufb control: source bytes 2,3 repeated in every 16-bit lane.
+static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
+                                                   2, 3, 2, 3, 2, 3, 2, 3 };
+// pshufb control: source bytes 4,5 repeated in every 16-bit lane.
+static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
+                                                   4, 5, 4, 5, 4, 5, 4, 5 };
+// pshufb control: source bytes 6,7 repeated in every 16-bit lane.
+static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
+                                                   6, 7, 6, 7, 6, 7, 6, 7 };
+// shuffle_gamma0_maskN: source bytes 4N..4N+3 repeated in every 32-bit lane.
+static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
+                                                  0, 1, 2, 3, 0, 1, 2, 3 };
+static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
+                                                  4, 5, 6, 7, 4, 5, 6, 7 };
+static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
+                                                  8, 9, 10, 11, 8, 9, 10, 11 };
+static const uint8_t shuffle_gamma0_mask3[16] = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                      const int offset_bits_horiz,
-                                     const int reduce_bits_horiz) {
+                                     const int reduce_bits_horiz, int k) {
   const __m128i src_even =
       _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
   const __m128i src_odd =
       _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+  // The pixel order we need for 'src' is:
+  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);  // u8*s8 pairs
+  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+                                            _mm_srli_si128(src_odd, 4));
+  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
+  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+  const __m128i src_13 =
+      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
+  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+                                            _mm_srli_si128(src_even, 6));
+  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
+
+  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+                                             ((1 << reduce_bits_horiz) >> 1));
 
+  // Note: The values res_02 + res_46 and res_13 + res_57 both
+  // fit into int16s at this point, but their sum may be too wide to fit
+  // into an int16. However, once we also add round_const, the sum of
+  // all of these fits into a uint16.
+  //
+  // The wrapping behaviour of _mm_add_* is used here to make sure we
+  // get the correct result despite converting between different
+  // (implicit) types.
+  const __m128i res_even = _mm_add_epi16(res_02, res_46);  // taps 0,2,4,6
+  const __m128i res_odd = _mm_add_epi16(res_13, res_57);  // taps 1,3,5,7
+  const __m128i res =
+      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m128i *coeff) {
   // Filter even-index pixels
   const __m128i tmp_0 = _mm_loadl_epi64(
       (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
@@ -249,47 +306,504 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
   const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
 
   // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
   // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
   // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
   // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
 
-  // The pixel order we need for 'src' is:
-  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
-  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
-  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
-  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
-  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
-                                            _mm_srli_si128(src_odd, 4));
-  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
-  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
-  const __m128i src_13 =
-      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
-  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
-  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
-  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
-                                            _mm_srli_si128(src_even, 6));
-  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+                                                          __m128i *coeff) {
+  // One filter row (chosen by sx alone) supplies the taps for all pixels
+  const __m128i tmp_0 =
+      _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
 
-  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
-                                             ((1 << reduce_bits_horiz) >> 1));
+  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+  coeff[0] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+  coeff[1] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+  coeff[2] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+  coeff[3] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+}
 
-  // Note: The values res_02 + res_46 and res_13 + res_57 both
-  // fit into int16s at this point, but their sum may be too wide to fit
-  // into an int16. However, once we also add round_const, the sum of
-  // all of these fits into a uint16.
-  //
-  // The wrapping behaviour of _mm_add_* is used here to make sure we
-  // get the correct result despite converting between different
-  // (implicit) types.
-  const __m128i res_even = _mm_add_epi16(res_02, res_46);
-  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
-  const __m128i res =
-      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
-  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+                                     int alpha, int k,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz) {
+  __m128i coeff[4];  // coeff[0..3]: tap pairs 0/2, 4/6, 1/3, 5/7
+  prepare_horizontal_filter_coeff(alpha, sx, coeff);  // taps from sx, alpha
+  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+                                          int stride, int32_t ix4, int32_t iy4,
+                                          int32_t sx4, int alpha, int beta,
+                                          int p_height, int height, int i,
+                                          const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  int k;
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // at most 15 rows
+    int iy = iy4 + k;
+    if (iy < 0)  // clamp the source row to [0, height - 1]
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);  // filter position moves by beta per row
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+                      reduce_bits_horiz);
+  }
+}
+
+static INLINE void warp_horizontal_filter_alpha0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;  // unused: the _alpha0 coefficient helper takes only sx
+  int k;
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // at most 15 rows
+    int iy = iy4 + k;
+    if (iy < 0)  // clamp the source row to [0, height - 1]
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);  // filter position moves by beta per row
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+    __m128i coeff[4];  // rebuilt for every row because sx changes
+    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void warp_horizontal_filter_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;  // unused: sx4 is the filter position for every row
+  int k;
+  __m128i coeff[4];  // built once here and reused by all rows
+  prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // at most 15 rows
+    int iy = iy4 + k;
+    if (iy < 0)  // clamp the source row to [0, height - 1]
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;  // unused: sx4 is the filter position for every row
+  (void)alpha;  // unused: the _alpha0 coefficient helper takes only sx
+  int k;
+
+  __m128i coeff[4];  // built once here and reused by all rows
+  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // at most 15 rows
+    int iy = iy4 + k;
+    if (iy < 0)  // clamp the source row to [0, height - 1]
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void unpack_weights_and_set_round_const(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+  *res_sub_const =  // stored negated, so users add it to subtract the offset
+      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));  // rounding
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  *wt = _mm_unpacklo_epi16(wt0, wt1);  // 16-bit lanes: w0, w1, w0, w1, ...
+}
+
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+                                                  __m128i *coeffs) {
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+  // even coeffs: built from the filters at sy + {0, 2, 4, 6} * gamma
+  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+  const __m128i tmp_1 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+  // odd coeffs: built from the filters at sy + {1, 3, 5, 7} * gamma
+  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+                                                         __m128i *coeffs) {
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  // even coeffs: each register repeats one 32-bit piece of tmp_0
+  coeffs[0] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+  coeffs[1] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+  coeffs[2] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+  coeffs[3] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+
+  // odd coeffs: identical, since only one filter row is loaded
+  coeffs[4] = coeffs[0];
+  coeffs[5] = coeffs[1];
+  coeffs[6] = coeffs[2];
+  coeffs[7] = coeffs[3];
+}
+
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+                                              __m128i *res_lo, __m128i *res_hi,
+                                              int k) {
+  // Load from tmp and rearrange pairs of consecutive rows into the
+  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+  const __m128i *src = tmp + (k + 4);  // src[0..7] are read below
+  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);  // even-index half
+  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+
+  const __m128i res_even =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
+
+  // Filter odd-index pixels
+  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
+  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
+  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
+  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
+
+  const __m128i res_odd =
+      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
+
+  // Rearrange pixels back into the order 0 ... 7
+  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+}
+
+static INLINE void store_vertical_filter_output(
+    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+    const int reduce_bits_vert, int p_stride, int p_width,
+    const int round_bits) {
+  __m128i res_lo_1 = *res_lo;
+  __m128i res_hi_1 = *res_hi;
+
+  if (conv_params->is_compound) {  // compound: works with conv_params->dst
+    __m128i *const p =
+        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+                              reduce_bits_vert);
+    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+    __m128i res_lo_16;
+    if (conv_params->do_average) {  // combine with the value stored in dst
+      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+      const __m128i p_16 = _mm_loadl_epi64(p);
+
+      if (conv_params->use_jnt_comp_avg) {  // weighted by *wt
+        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+        const __m128i shifted_32 =
+            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+      } else {  // plain average of the two values
+        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+      }
+
+      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+                                 round_bits);
+      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);  // writes 4 pixels
+    } else {  // no averaging: keep the 16-bit values in dst
+      _mm_storel_epi64(p, temp_lo_16);
+    }
+    if (p_width > 4) {  // same steps for columns j + 4 .. j + 7
+      __m128i *const p4 =
+          (__m128i *)&conv_params
+              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+                                reduce_bits_vert);
+      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+      __m128i res_hi_16;
+
+      if (conv_params->do_average) {
+        __m128i *const dst8_4 =
+            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+        const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+        if (conv_params->use_jnt_comp_avg) {
+          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+          const __m128i shifted_32 =
+              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+        } else {
+          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+        }
+        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+                                   round_bits);
+        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+      } else {
+        _mm_storel_epi64(p4, temp_hi_16);
+      }
+    }
+  } else {  // non-compound: round, pack to 8 bits, store in pred
+    const __m128i res_lo_round = _mm_srai_epi32(
+        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+    const __m128i res_hi_round = _mm_srai_epi32(
+        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+    // Store, blending with 'pred' if needed
+    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+    // Note: If we're outputting a 4x4 block, we need to be very careful
+    // to only output 4 pixels at this point, to avoid encode/decode
+    // mismatches when encoding with multiple threads.
+    if (p_width == 4) {
+      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);  // NOTE: maybe unaligned
+    } else {
+      _mm_storel_epi64(p, res_8bit);
+    }
+  }
+}
+
+static INLINE void warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  int k;  // row offset; the output row is i + k + 4
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+  // Vertical filter (general case): sy and coeffs are recomputed per row.
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    int sy = sy4 + delta * (k + 4);
+
+    __m128i coeffs[8];
+    prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+    __m128i res_lo;
+    __m128i res_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void warp_vertical_filter_gamma0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  int k;  // row offset; the output row is i + k + 4
+  (void)gamma;  // unused: this variant is selected when gamma == 0
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+  // Vertical filter (gamma == 0): coeffs are built from sy alone, per row.
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    int sy = sy4 + delta * (k + 4);
+
+    __m128i coeffs[8];
+    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+    __m128i res_lo;
+    __m128i res_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void warp_vertical_filter_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;  // unused: this variant is selected when delta == 0
+  int k;  // row offset; the output row is i + k + 4
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+
+  __m128i coeffs[8];
+  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);  // once, not per row
+  // Vertical filter (delta == 0): every row reuses the coeffs built above.
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    __m128i res_lo;
+    __m128i res_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;  // unused: this variant is selected when delta == 0
+  (void)gamma;  // unused: this variant is selected when gamma == 0
+  int k;  // row offset; the output row is i + k + 4
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+
+  __m128i coeffs[8];
+  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);  // once, from sy4 only
+  // Vertical filter (gamma == delta == 0): all rows reuse the coeffs above.
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    __m128i res_lo;
+    __m128i res_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void prepare_warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  if (gamma == 0 && delta == 0)  // coeffs built once, without gamma
+    warp_vertical_filter_gamma0_delta0(
+        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
+        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+  else if (gamma == 0 && delta != 0)  // coeffs built per row, without gamma
+    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
+                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                                res_add_const, round_bits, offset_bits);
+  else if (gamma != 0 && delta == 0)  // coeffs built once, using gamma
+    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                                res_add_const, round_bits, offset_bits);
+  else  // general case: coeffs built per row, using gamma
+    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                         res_add_const, round_bits, offset_bits);
+}
+
+static INLINE void prepare_warp_horizontal_filter(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0 && beta == 0)  // all four variants get the same arguments
+    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                        beta, p_height, height, i,
+                                        offset_bits_horiz, reduce_bits_horiz);
+  else if (alpha == 0 && beta != 0)  // only alpha is zero
+    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                  p_height, height, i, offset_bits_horiz,
+                                  reduce_bits_horiz);
+  else if (alpha != 0 && beta == 0)  // only beta is zero
+    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                 p_height, height, i, offset_bits_horiz,
+                                 reduce_bits_horiz);
+  else  // neither is zero: unspecialised filter
+    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                           p_height, height, i, offset_bits_horiz,
+                           reduce_bits_horiz);
 }
 
 void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
@@ -309,24 +823,12 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 
   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
   const __m128i reduce_bits_vert_const =
       _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
   const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const __m128i res_sub_const =
-      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
-                     (1 << (offset_bits - conv_params->round_1 - 1)));
-  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
-  __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
-
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 
   /* Note: For this code to work, the left/right frame borders need to be
@@ -340,6 +842,13 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
   assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
   }
   }*/
+  __m128i res_add_const_1;
+  if (conv_params->is_compound == 1) {
+    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+  } else {
+    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                                     ((1 << reduce_bits_vert) >> 1));
+  }
 
   for (i = 0; i < p_height; i += 8) {
     for (j = 0; j < p_width; j += 8) {
@@ -419,203 +928,15 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                             reduce_bits_horiz);
         }
       } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          const __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
-                            reduce_bits_horiz);
-        }
+        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                       beta, p_height, height, i,
+                                       offset_bits_horiz, reduce_bits_horiz);
       }
 
       // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-
-        // Load from tmp and rearrange pairs of consecutive rows into the
-        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
-        const __m128i *src = tmp + (k + 4);
-        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
-        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
-        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
-        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
-        // Filter even-index pixels
-        const __m128i tmp_0 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_2 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_4 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_6 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
-        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
-        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
-        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
-        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
-        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
-        const __m128i tmp_1 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_3 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_5 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_7 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        if (conv_params->is_compound) {
-          __m128i *const p =
-              (__m128i *)&conv_params
-                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
-          res_lo = _mm_add_epi32(res_lo, res_add_const);
-          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
-                                 reduce_bits_vert_shift);
-          const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo);
-          __m128i res_lo_16;
-          if (conv_params->do_average) {
-            __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-            const __m128i p_16 = _mm_loadl_epi64(p);
-
-            if (conv_params->use_jnt_comp_avg) {
-              const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
-              const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
-              const __m128i shifted_32 =
-                  _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-              res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
-            } else {
-              res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
-            }
-
-            res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const);
-
-            res_lo_16 = _mm_sra_epi16(
-                _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift);
-            __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
-            *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
-          } else {
-            _mm_storel_epi64(p, temp_lo_16);
-          }
-          if (p_width > 4) {
-            __m128i *const p4 =
-                (__m128i *)&conv_params
-                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
-            res_hi = _mm_add_epi32(res_hi, res_add_const);
-            res_hi =
-                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
-                              reduce_bits_vert_shift);
-            const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi);
-            __m128i res_hi_16;
-
-            if (conv_params->do_average) {
-              __m128i *const dst8_4 =
-                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
-              const __m128i p4_16 = _mm_loadl_epi64(p4);
-
-              if (conv_params->use_jnt_comp_avg) {
-                const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
-                const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt);
-                const __m128i shifted_32 =
-                    _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-                res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
-              } else {
-                res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
-              }
-              res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const);
-
-              res_hi_16 = _mm_sra_epi16(
-                  _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift);
-              __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
-              *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
-
-            } else {
-              _mm_storel_epi64(p4, temp_hi_16);
-            }
-          }
-        } else {
-          // Round and pack into 8 bits
-          const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
-                             ((1 << reduce_bits_vert) >> 1));
-
-          const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
-          const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
-
-          const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-          __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-          // Store, blending with 'pred' if needed
-          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
-          // Note: If we're outputting a 4x4 block, we need to be very careful
-          // to only output 4 pixels at this point, to avoid encode/decode
-          // mismatches when encoding with multiple threads.
-          if (p_width == 4) {
-            *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
-          } else {
-            _mm_storel_epi64(p, res_8bit);
-          }
-        }
-      }
+      prepare_warp_vertical_filter(
+          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
     }
   }
 }
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
index e1449fd21..87a6e1239 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -39,7 +39,8 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 
   DECLARE_ALIGNED(32, uint16_t,
                   temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 1;
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
   const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
 
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
index 3083d224b..f9d00b733 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -32,7 +32,8 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
 
   DECLARE_ALIGNED(16, uint16_t,
                   temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 1;
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
   int i, j;
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
   const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
index 9099d081b..288e5e63e 100644
--- a/third_party/aom/av1/decoder/accounting.h
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AOM_ACCOUNTING_H_
-#define AOM_ACCOUNTING_H_
+#ifndef AOM_AV1_DECODER_ACCOUNTING_H_
+#define AOM_AV1_DECODER_ACCOUNTING_H_
 #include <stdlib.h>
 #include "aom/aomdx.h"
 
@@ -79,4 +79,4 @@ void aom_accounting_dump(Accounting *accounting);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // AOM_ACCOUNTING_H_
+#endif  // AOM_AV1_DECODER_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
index 6dbc4f3eb..31f14b531 100644
--- a/third_party/aom/av1/decoder/decodeframe.c
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -43,6 +43,7 @@
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
 #include "av1/common/idct.h"
 #include "av1/common/mvref_common.h"
 #include "av1/common/pred_common.h"
@@ -87,18 +88,25 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
 static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
                                        const YV12_BUFFER_CONFIG *const buf,
                                        int only_chroma) {
-  const int val = 1 << (seq_params->bit_depth - 1);
-
-  for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
-    const int is_uv = plane > 0;
-    for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
-      if (seq_params->use_highbitdepth) {
-        // TODO(yaowu): replace this with aom_memset16() for speed
-        for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) {
-          uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
-          base[row_idx * buf->strides[is_uv] + col_idx] = val;
+  if (seq_params->use_highbitdepth) {
+    const int val = 1 << (seq_params->bit_depth - 1);
+    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+      const int is_uv = plane > 0;
+      uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+      // Set the first row to neutral grey. Then copy the first row to all
+      // subsequent rows.
+      if (buf->crop_heights[is_uv] > 0) {
+        aom_memset16(base, val, buf->crop_widths[is_uv]);
+        for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+          memcpy(&base[row_idx * buf->strides[is_uv]], base,
+                 sizeof(*base) * buf->crop_widths[is_uv]);
         }
-      } else {
+      }
+    }
+  } else {
+    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+      const int is_uv = plane > 0;
+      for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
         memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7,
                buf->crop_widths[is_uv]);
       }
@@ -687,11 +695,10 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
       for (int x = 0; x < b8_w; x += b4_w) {
         MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
         is_compound = has_second_ref(this_mbmi);
-        DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
         int tmp_dst_stride = 8;
         assert(bw < 8 || bh < 8);
         ConvolveParams conv_params = get_conv_params_no_round(
-            0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
+            0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
         conv_params.use_jnt_comp_avg = 0;
         struct buf_2d *const dst_buf = &pd->dst;
         uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
@@ -735,7 +742,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
         extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
                          subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
                          &pre, &src_stride);
-        conv_params.ref = ref;
         conv_params.do_average = ref;
         if (is_masked_compound_type(mi->interinter_comp.type)) {
           // masked compound type has its own average mechanism
@@ -762,7 +768,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
     uint8_t *const dst = dst_buf->buf;
     uint8_t *pre[2];
     SubpelParams subpel_params[2];
-    DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
     int src_stride[2];
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf =
@@ -797,7 +802,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
     }
 
     ConvolveParams conv_params = get_conv_params_no_round(
-        0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
+        0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
     av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
                                &conv_params.bck_offset,
                                &conv_params.use_jnt_comp_avg, is_compound);
@@ -808,7 +813,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
       WarpTypesAllowed warp_types;
       warp_types.global_warp_allowed = is_global[ref];
       warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-      conv_params.ref = ref;
       conv_params.do_average = ref;
       if (is_masked_compound_type(mi->interinter_comp.type)) {
         // masked compound type has its own average mechanism
@@ -931,7 +935,7 @@ static void dec_build_prediction_by_above_preds(
   // Adjust mb_to_bottom_edge to have the correct value for the OBMC
   // prediction block. This is half the height of the original block,
   // except for 128-wide blocks, where we only use a height of 32.
-  int this_height = xd->n8_h * MI_SIZE;
+  int this_height = xd->n4_h * MI_SIZE;
   int pred_height = AOMMIN(this_height / 2, 32);
   xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
 
@@ -984,7 +988,7 @@ static void dec_build_prediction_by_left_preds(
   // Adjust mb_to_right_edge to have the correct value for the OBMC
   // prediction block. This is half the width of the original block,
   // except for 128-wide blocks, where we only use a width of 32.
-  int this_width = xd->n8_w * MI_SIZE;
+  int this_width = xd->n4_w * MI_SIZE;
   int pred_width = AOMMIN(this_width / 2, 32);
   xd->mb_to_right_edge += (this_width - pred_width) * 8;
 
@@ -1006,8 +1010,6 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
                                                MACROBLOCKD *xd, int mi_row,
                                                int mi_col) {
   const int num_planes = av1_num_planes(cm);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -1018,19 +1020,23 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
 
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
-    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
-    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+    dst_buf1[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+    dst_buf1[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+    dst_buf2[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+    dst_buf2[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
   } else {
-    dst_buf1[0] = tmp_buf1;
-    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
-    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = tmp_buf2;
-    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
-    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+    dst_buf1[0] = xd->tmp_obmc_bufs[0];
+    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+    dst_buf2[0] = xd->tmp_obmc_bufs[1];
+    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
   }
   dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                       dst_width1, dst_height1, dst_stride1);
@@ -1069,8 +1075,9 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   }
 
   dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-  if (mbmi->motion_mode == OBMC_CAUSAL)
+  if (mbmi->motion_mode == OBMC_CAUSAL) {
     dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+  }
 #if CONFIG_MISMATCH_DEBUG
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
@@ -1225,9 +1232,18 @@ static void decode_token_recon_block(AV1Decoder *const pbi,
                     set_color_index_map_offset);
 }
 
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                MB_MODE_INFO *mbmi);
+#endif
+
 static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
-                               TX_SIZE tx_size, int depth, int blk_row,
-                               int blk_col, aom_reader *r) {
+                               TX_SIZE tx_size, int depth,
+#if LOOP_FILTER_BITMASK
+                               AV1_COMMON *cm, int mi_row, int mi_col,
+#endif
+                               int blk_row, int blk_col, aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   int is_split = 0;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -1271,15 +1287,29 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
       mbmi->tx_size = sub_txs;
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, sub_txs, tx_size);
+#if LOOP_FILTER_BITMASK
+      store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, BLOCK_8X8,
+                          TX_4X4, mbmi);
+#endif
       return;
     }
+#if LOOP_FILTER_BITMASK
+    if (depth + 1 == MAX_VARTX_DEPTH) {
+      store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+                          txsize_to_bsize[tx_size], sub_txs, mbmi);
+    }
+#endif
 
     assert(bsw > 0 && bsh > 0);
     for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
       for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
         int offsetr = blk_row + row;
         int offsetc = blk_col + col;
-        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
+        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
+#if LOOP_FILTER_BITMASK
+                           cm, mi_row, mi_col,
+#endif
+                           offsetr, offsetc, r);
       }
     }
   } else {
@@ -1293,6 +1323,10 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
     mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
+#if LOOP_FILTER_BITMASK
+    store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+                        txsize_to_bsize[tx_size], tx_size, mbmi);
+#endif
   }
 }
 
@@ -1330,6 +1364,191 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
   }
 }
 
+#if LOOP_FILTER_BITMASK
+// Adds the edges of one transform block to the LoopFilterMask that
+// get_loop_filter_mask() returns for (mi_row, mi_col). |tx_size| and |bsize|
+// select the entry of the univariant mask tables; the chroma masks are stored
+// under the largest chroma transform size of mbmi->sb_type.
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                MB_MODE_INFO *mbmi) {
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  // Both chroma directions are derived from the same maximum transform size.
+  const int max_uv_tx =
+      av1_get_max_uv_txsize(mbmi->sb_type, cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y);
+  const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+  const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+  const TX_SIZE tx_size_uv_vert = txsize_vert_map[max_uv_tx];
+  const TX_SIZE tx_size_uv_horz = txsize_horz_map[max_uv_tx];
+  const int is_square_transform_size = tx_size <= TX_64X64;
+  const int is_half_ratio_max32 = tx_size > TX_64X64 && tx_size <= TX_32X16;
+  // Entry of left/above_mask_univariant_reordered[] to OR into the masks.
+  int mask_id = 0;
+
+  if (is_square_transform_size) {
+    switch (tx_size) {
+      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+      case TX_8X8: mask_id = 19 + mask_id_table_tx_8x8[bsize]; break;
+      case TX_16X16: mask_id = 33 + mask_id_table_tx_16x16[bsize]; break;
+      case TX_32X32: mask_id = 42 + mask_id_table_tx_32x32[bsize]; break;
+      case TX_64X64: mask_id = 46; break;
+      default: assert(!is_square_transform_size); return;
+    }
+  } else if (is_half_ratio_max32) {
+    // Two consecutive ids per size: the first when the transform is as large
+    // as |bsize|, the second otherwise.
+    const int same_size = bsize == txsize_to_bsize[tx_size];
+    mask_id = 47 + 2 * (tx_size - TX_4X8) + (same_size ? 0 : 1);
+  } else if (tx_size == TX_32X64) {
+    mask_id = 59;
+  } else if (tx_size == TX_64X32) {
+    mask_id = 60;
+  } else {  // quarter ratio tx size
+    mask_id = 61 + (tx_size - TX_4X16);
+  }
+
+  // Position of the block inside its 64x64 unit: |index| is the first bits[]
+  // word to update and |shift| the bit offset within it.
+  int index = 0;
+  const int row = mi_row % MI_SIZE_64X64;
+  const int col = mi_col % MI_SIZE_64X64;
+  const int shift = get_index_shift(col, row, &index);
+  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+  for (int word = index; word < 4; ++word) {
+    const int i = word - index;
+    // y vertical.
+    lfm->tx_size_ver[0][tx_size_y_horz].bits[word] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // y horizontal.
+    lfm->tx_size_hor[0][tx_size_y_vert].bits[word] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+    // u/v vertical.
+    lfm->tx_size_ver[1][tx_size_uv_horz].bits[word] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // u/v horizontal.
+    lfm->tx_size_hor[1][tx_size_uv_vert].bits[word] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+  }
+}
+
+// Stores the transform edge masks of a block whose transforms all share
+// mbmi->tx_size, using the lookup tables that provide one bitmask for a given
+// block size and a univariant transform size.
+//   mi_row, mi_col: locate the LoopFilterMask and the bit offset inside it.
+//   bsize:          indexes the mask id tables.
+//   mbmi:           supplies tx_size and, through sb_type, the chroma
+//                   transform size.
+static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const TX_SIZE tx_size = mbmi->tx_size;
+  // Both chroma directions are derived from the same maximum transform size.
+  const int max_uv_tx =
+      av1_get_max_uv_txsize(mbmi->sb_type, cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y);
+  const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+  const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+  const TX_SIZE tx_size_uv_vert = txsize_vert_map[max_uv_tx];
+  const TX_SIZE tx_size_uv_horz = txsize_horz_map[max_uv_tx];
+  const int is_square_transform_size = tx_size <= TX_64X64;
+  const int is_half_ratio_max32 = tx_size > TX_64X64 && tx_size <= TX_32X16;
+  // Entry of left/above_mask_univariant_reordered[] to OR into the masks.
+  int mask_id = 0;
+
+  if (is_square_transform_size) {
+    switch (tx_size) {
+      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+      case TX_8X8: mask_id = 19 + mask_id_table_tx_8x8[bsize]; break;
+      case TX_16X16: mask_id = 33 + mask_id_table_tx_16x16[bsize]; break;
+      case TX_32X32: mask_id = 42 + mask_id_table_tx_32x32[bsize]; break;
+      case TX_64X64: mask_id = 46; break;
+      default: assert(!is_square_transform_size); return;
+    }
+  } else if (is_half_ratio_max32) {
+    // Two consecutive ids per size: the first when the transform is as large
+    // as |bsize|, the second otherwise.
+    const int same_size = bsize == txsize_to_bsize[tx_size];
+    mask_id = 47 + 2 * (tx_size - TX_4X8) + (same_size ? 0 : 1);
+  } else if (tx_size == TX_32X64) {
+    mask_id = 59;
+  } else if (tx_size == TX_64X32) {
+    mask_id = 60;
+  } else {  // quarter ratio tx size
+    mask_id = 61 + (tx_size - TX_4X16);
+  }
+
+  // Position of the block inside its 64x64 unit: |index| is the first bits[]
+  // word to update and |shift| the bit offset within it.
+  int index;
+  const int row = mi_row % MI_SIZE_64X64;
+  const int col = mi_col % MI_SIZE_64X64;
+  const int shift = get_index_shift(col, row, &index);
+  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+  // OR the table masks into every word from |index| on; the vertical entries
+  // use |vert_shift| and the horizontal entries use |shift|.
+  for (int word = index; word < 4; ++word) {
+    const int i = word - index;
+    // y vertical.
+    lfm->tx_size_ver[0][tx_size_y_horz].bits[word] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // y horizontal.
+    lfm->tx_size_hor[0][tx_size_y_vert].bits[word] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+    // u/v vertical.
+    lfm->tx_size_ver[1][tx_size_uv_horz].bits[word] |=
+        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+    // u/v horizontal.
+    lfm->tx_size_hor[1][tx_size_uv_vert].bits[word] |=
+        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+  }
+}
+
+// Records block-level loop filter data for the block at (mi_row, mi_col):
+// coding block borders, the inter-skip mask and the filter levels.
+// parse_decode_block() splits larger blocks, so |bsize| is at most 64x64.
+static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+  int index;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const int row_start = mi_row % MI_SIZE_64X64;
+  const int col_start = mi_col % MI_SIZE_64X64;
+  const int shift = get_index_shift(col_start, row_start, &index);
+  // Build the run of |width| set bits first, then shift it into place. The
+  // form (1 << (shift + width)) - (1 << shift) shifts a 64-bit value by 64,
+  // which is undefined, when the block ends on the top bit of the word.
+  const uint64_t top_edge_mask =
+      (((uint64_t)1 << mi_size_wide[bsize]) - 1) << shift;
+  lfm->is_horz_border.bits[index] |= top_edge_mask;
+  const int is_vert_border = mask_id_table_vert_border[bsize];
+  const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+  for (int i = 0; i + index < 4; ++i) {
+    lfm->is_vert_border.bits[i + index] |=
+        (left_mask_univariant_reordered[is_vert_border].bits[i] << vert_shift);
+  }
+  const int is_skip = mbmi->skip && is_inter_block(mbmi);
+  if (is_skip) {
+    const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+    for (int i = 0; i + index < 4; ++i) {
+      lfm->skip.bits[i + index] |=
+          (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+    }
+  }
+  const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+  const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+  const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+  const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+  const size_t row_bytes = sizeof(uint8_t) * mi_size_wide[bsize];
+  for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+    const int row = r % MI_SIZE_64X64;
+    memset(&lfm->lfl_y_ver[row][col_start], level_vert_y, row_bytes);
+    memset(&lfm->lfl_y_hor[row][col_start], level_horz_y, row_bytes);
+    memset(&lfm->lfl_u[row][col_start], level_u, row_bytes);
+    memset(&lfm->lfl_v[row][col_start], level_v, row_bytes);
+  }
+}
+#endif
+
 static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
                                int mi_row, int mi_col, aom_reader *r,
                                PARTITION_TYPE partition, BLOCK_SIZE bsize) {
@@ -1353,14 +1572,46 @@ static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
 
     for (int idy = 0; idy < height; idy += bh)
       for (int idx = 0; idx < width; idx += bw)
-        read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
+        read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
+#if LOOP_FILTER_BITMASK
+                           cm, mi_row, mi_col,
+#endif
+                           idy, idx, r);
   } else {
     mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
     if (inter_block_tx)
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-    set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+    set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
                   mbmi->skip && is_inter_block(mbmi), xd);
+#if LOOP_FILTER_BITMASK
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+      store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
+    } else {
+      for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+        for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+          store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
+                                      BLOCK_64X64, mbmi);
+        }
+      }
+    }
+#endif
+  }
+#if LOOP_FILTER_BITMASK
+  const int w = mi_size_wide[bsize];
+  const int h = mi_size_high[bsize];
+  if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+    store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi);
+  } else {
+    for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+      for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+        store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64,
+                                 mbmi);
+      }
+    }
   }
+#endif
 
   if (cm->delta_q_present_flag) {
     for (int i = 0; i < MAX_SEGMENTS; i++) {
@@ -1952,6 +2203,11 @@ static void setup_quantization(AV1_COMMON *const cm,
       cm->v_dc_delta_q = cm->u_dc_delta_q;
       cm->v_ac_delta_q = cm->u_ac_delta_q;
     }
+  } else {
+    cm->u_dc_delta_q = 0;
+    cm->u_ac_delta_q = 0;
+    cm->v_dc_delta_q = 0;
+    cm->v_ac_delta_q = 0;
   }
   cm->dequant_bit_depth = seq_params->bit_depth;
   cm->using_qmatrix = aom_rb_read_bit(rb);
@@ -2082,29 +2338,9 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
   cm->cur_frame->height = cm->height;
 }
 
-static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
-                             struct aom_read_bit_buffer *rb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  int width, height;
+static void setup_buffer_pool(AV1_COMMON *cm) {
   BufferPool *const pool = cm->buffer_pool;
-
-  if (frame_size_override_flag) {
-    int num_bits_width = seq_params->num_bits_width;
-    int num_bits_height = seq_params->num_bits_height;
-    av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
-    if (width > seq_params->max_frame_width ||
-        height > seq_params->max_frame_height) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                         "Frame dimensions are larger than the maximum values");
-    }
-  } else {
-    width = seq_params->max_frame_width;
-    height = seq_params->max_frame_height;
-  }
-
-  setup_superres(cm, rb, &width, &height);
-  resize_context_buffers(cm, width, height);
-  setup_render_size(cm, rb);
+  const SequenceHeader *const seq_params = &cm->seq_params;
 
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
@@ -2140,6 +2376,31 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
+// Determines the coded frame size: the sequence maximum unless
+// |frame_size_override_flag| is set, in which case it is read from |rb|. Then
+// runs the superres, context buffer, render size and buffer pool setup.
+static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
+                             struct aom_read_bit_buffer *rb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  int width = seq_params->max_frame_width;
+  int height = seq_params->max_frame_height;
+
+  if (frame_size_override_flag) {
+    av1_read_frame_size(rb, seq_params->num_bits_width,
+                        seq_params->num_bits_height, &width, &height);
+    if (width > seq_params->max_frame_width ||
+        height > seq_params->max_frame_height) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Frame dimensions are larger than the maximum values");
+    }
+  }
+
+  setup_superres(cm, rb, &width, &height);
+  resize_context_buffers(cm, width, height);
+  setup_render_size(cm, rb);
+  setup_buffer_pool(cm);
+}
+
 static void setup_sb_size(SequenceHeader *seq_params,
                           struct aom_read_bit_buffer *rb) {
   set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
@@ -2158,7 +2419,6 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
   int width, height;
   int found = 0;
   int has_valid_ref_frame = 0;
-  BufferPool *const pool = cm->buffer_pool;
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     if (aom_rb_read_bit(rb)) {
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
@@ -2208,39 +2468,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
-
-  lock_buffer_pool(pool);
-  if (aom_realloc_frame_buffer(
-          get_frame_new_buffer(cm), cm->width, cm->height,
-          seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
-          cm->byte_alignment,
-          &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
-          pool->cb_priv)) {
-    unlock_buffer_pool(pool);
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate frame buffer");
-  }
-  unlock_buffer_pool(pool);
-
-  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
-      seq_params->subsampling_x;
-  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
-      seq_params->subsampling_y;
-  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
-      (unsigned int)seq_params->bit_depth;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
-      seq_params->color_primaries;
-  pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
-      seq_params->transfer_characteristics;
-  pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
-      seq_params->matrix_coefficients;
-  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
-  pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
-      seq_params->chroma_sample_position;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
-  pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
-  pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+  setup_buffer_pool(cm);
 }
 
 // Same function as av1_read_uniform but reading from uncompresses header wb
@@ -2252,7 +2480,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
   if (v < m)
     return v;
   else
-    return (v << 1) - m + aom_rb_read_literal(rb, 1);
+    return (v << 1) - m + aom_rb_read_bit(rb);
 }
 
 static void read_tile_info_max_tile(AV1_COMMON *const cm,
@@ -2344,6 +2572,10 @@ static void read_tile_info(AV1Decoder *const pbi,
     // tile to use for cdf update
     cm->context_update_tile_id =
         aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+    if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid context_update_tile_id");
+    }
     // tile size magnitude
     pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
   }
@@ -2746,31 +2978,13 @@ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
 #endif  // CONFIG_MULTITHREAD
 }
 
-static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) {
-  AV1_COMMON *cm = &pbi->common;
-  int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
-      tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
-  return sb_rows;
-}
-
-static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) {
-  AV1_COMMON *cm = &pbi->common;
-  int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
-      tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
-  int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
-  return sb_cols;
-}
-
 static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
                                TileInfo tile_info, const int mi_row) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
   TileDataDec *const tile_data =
       pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
-  const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info);
+  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
   const int sb_row_in_tile =
       (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
   int sb_col_in_tile = 0;
@@ -2792,15 +3006,11 @@ static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
 }
 
 static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
+  if (aom_reader_has_overflowed(r)) return -1;
+
   uint32_t nb_bits = aom_reader_tell(r);
   uint32_t nb_bytes = (nb_bits + 7) >> 3;
-
-  const uint8_t *p_begin = aom_reader_find_begin(r);
-  const uint8_t *p_end = aom_reader_find_end(r);
-
-  // It is legal to have no padding bytes (nb_bytes == p_end - p_begin).
-  if ((ptrdiff_t)nb_bytes > p_end - p_begin) return -1;
-  const uint8_t *p = p_begin + nb_bytes;
+  const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
 
   // aom_reader_tell() returns 1 for a newly initialized decoder, and the
   // return value only increases as values are decoded. So nb_bits > 0, and
@@ -2810,6 +3020,7 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
   if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
 
   // Make sure that all padding bytes are zero as required by the spec.
+  const uint8_t *p_end = aom_reader_find_end(r);
   while (p < p_end) {
     if (*p != 0) return -1;
     p++;
@@ -2863,6 +3074,11 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
       // Bit-stream parsing and decoding of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
                        cm->seq_params.sb_size, 0x3);
+
+      if (aom_reader_has_overflowed(td->bit_reader)) {
+        aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+        return;
+      }
     }
   }
 
@@ -2950,6 +3166,11 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
   td->xd.corrupted = 0;
   td->xd.mc_buf[0] = td->mc_buf[0];
   td->xd.mc_buf[1] = td->mc_buf[1];
+  td->xd.tmp_conv_dst = td->tmp_conv_dst;
+  for (int j = 0; j < 2; ++j) {
+    td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+  }
+
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
     const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
 
@@ -3236,6 +3457,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
 #endif
     frame_row_mt_info->row_mt_exit = 1;
 #if CONFIG_MULTITHREAD
+    pthread_cond_broadcast(pbi->row_mt_cond_);
     pthread_mutex_unlock(pbi->row_mt_mutex_);
 #endif
     return 0;
@@ -3386,16 +3608,24 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
                   aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
 }
 
-void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) {
+void av1_free_mc_tmp_buf(ThreadData *thread_data) {
   int ref;
   for (ref = 0; ref < 2; ref++) {
-    if (use_highbd)
+    if (thread_data->mc_buf_use_highbd)
       aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
     else
       aom_free(thread_data->mc_buf[ref]);
     thread_data->mc_buf[ref] = NULL;
   }
   thread_data->mc_buf_size = 0;
+  thread_data->mc_buf_use_highbd = 0;
+
+  aom_free(thread_data->tmp_conv_dst);
+  thread_data->tmp_conv_dst = NULL;
+  for (int i = 0; i < 2; ++i) {
+    aom_free(thread_data->tmp_obmc_bufs[i]);
+    thread_data->tmp_obmc_bufs[i] = NULL;
+  }
 }
 
 static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
@@ -3411,6 +3641,17 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
     }
   }
   thread_data->mc_buf_size = buf_size;
+  thread_data->mc_buf_use_highbd = use_highbd;
+
+  CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
+                  aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                                       sizeof(*thread_data->tmp_conv_dst)));
+  for (int i = 0; i < 2; ++i) {
+    CHECK_MEM_ERROR(
+        cm, thread_data->tmp_obmc_bufs[i],
+        aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                             sizeof(*thread_data->tmp_obmc_bufs[i])));
+  }
 }
 
 static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
@@ -3425,6 +3666,10 @@ static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
     thread_data->td->xd.corrupted = 0;
     thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
     thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
+    thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+    for (int j = 0; j < 2; ++j) {
+      thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j];
+    }
     winterface->sync(worker);
 
     worker->hook = worker_hook;
@@ -3511,7 +3756,7 @@ static void decode_mt_init(AV1Decoder *pbi) {
   for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
     if (thread_data->td->mc_buf_size != buf_size) {
-      av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+      av1_free_mc_tmp_buf(thread_data->td);
       allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
     }
   }
@@ -3783,8 +4028,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
       TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
       av1_tile_init(&tile_data->tile_info, cm, row, col);
 
-      max_sb_rows =
-          AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info));
+      max_sb_rows = AOMMAX(max_sb_rows,
+                           av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
     }
   }
 
@@ -3905,6 +4150,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
 
   if (!seq_params->monochrome)
     pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+  else
+    pars->chroma_scaling_from_luma = 0;
 
   if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
       ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
@@ -4412,6 +4659,29 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
   *cm->fc = cm->frame_contexts[existing_frame_idx];
 }
 
+// Fills both reference maps with -1, then, under the buffer pool lock,
+// releases every frame buffer other than cm->new_fb_idx and zeroes the frame
+// offsets of all of them.
+static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+  lock_buffer_pool(pool);
+  for (int i = 0; i < FRAME_BUFFERS; ++i) {
+    if (i == cm->new_fb_idx) {
+      assert(frame_bufs[i].ref_count == 1);
+    } else {
+      frame_bufs[i].ref_count = 0;
+      pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer);
+    }
+    frame_bufs[i].cur_frame_offset = 0;
+    av1_zero(frame_bufs[i].ref_frame_offset);
+  }
+  av1_zero_unused_internal_frame_buffers(&pool->int_frame_buffers);
+  unlock_buffer_pool(pool);
+}
+
 // On success, returns 0. On failure, calls aom_internal_error and does not
 // return.
 static int read_uncompressed_header(AV1Decoder *pbi,
@@ -4443,6 +4713,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->reset_decoder_state = 0;
 
     if (cm->show_existing_frame) {
+      if (pbi->sequence_header_changed) {
+        aom_internal_error(
+            &cm->error, AOM_CODEC_CORRUPT_FRAME,
+            "New sequence header starts with a show_existing_frame.");
+      }
       // Show an existing frame directly.
       const int existing_frame_idx = aom_rb_read_literal(rb, 3);
       const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
@@ -4493,6 +4768,18 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     }
 
     cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);  // 2 bits
+    if (pbi->sequence_header_changed) {
+      if (pbi->common.frame_type == KEY_FRAME) {
+        // This is the start of a new coded video sequence.
+        pbi->sequence_header_changed = 0;
+        pbi->decoding_first_frame = 1;
+        reset_frame_buffers(&pbi->common);
+      } else {
+        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Sequence header has changed without a keyframe.");
+      }
+    }
+
     cm->show_frame = aom_rb_read_bit(rb);
     if (seq_params->still_picture &&
         (cm->frame_type != KEY_FRAME || !cm->show_frame)) {
@@ -4582,8 +4869,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
       }
     }
 
-    frame_size_override_flag =
-        frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1);
+    frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
 
     cm->frame_offset =
         aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
@@ -5152,7 +5438,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
   const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   if (pbi->td.mc_buf_size != buf_size) {
-    av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+    av1_free_mc_tmp_buf(&pbi->td);
     allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
   }
 }
@@ -5166,6 +5452,11 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
   const int tile_count_tg = end_tile - start_tile + 1;
 
   if (initialize_flag) setup_frame_info(pbi);
+  const int num_planes = av1_num_planes(cm);
+#if LOOP_FILTER_BITMASK
+  av1_loop_filter_frame_init(cm, 0, num_planes);
+  av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
+#endif
 
   if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
       pbi->row_mt)
@@ -5177,7 +5468,6 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
   else
     *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
 
-  const int num_planes = av1_num_planes(cm);
   // If the bit stream is monochrome, set the U and V buffers to a constant.
   if (num_planes < 3) {
     set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
@@ -5190,7 +5480,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
   if (!cm->allow_intrabc && !cm->single_tile_decoding) {
     if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
 #if LOOP_FILTER_BITMASK
-      av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+      av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 1, 0,
                             num_planes, 0);
 #else
       if (pbi->num_workers > 1) {
@@ -5255,6 +5545,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
 
   if (!xd->corrupted) {
     if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+      assert(cm->context_update_tile_id < pbi->allocated_tiles);
       *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
       av1_reset_cdf_symbol_counters(cm->fc);
     }
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
index d289b31f2..ddad273f1 100644
--- a/third_party/aom/av1/decoder/decodeframe.h
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_DECODER_DECODEFRAME_H_
-#define AV1_DECODER_DECODEFRAME_H_
+#ifndef AOM_AV1_DECODER_DECODEFRAME_H_
+#define AOM_AV1_DECODER_DECODEFRAME_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -74,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer(
     struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
     const uint8_t *data_end);
 
-void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd);
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
 
 void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
 
@@ -82,4 +82,4 @@ void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
 }  // extern "C"
 #endif
 
-#endif  // AV1_DECODER_DECODEFRAME_H_
+#endif  // AOM_AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
index 5e920b18d..551e4d543 100644
--- a/third_party/aom/av1/decoder/decodemv.c
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -94,42 +94,26 @@ static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
   }
   return reduced_delta_qindex;
 }
-static int read_delta_lflevel(AV1_COMMON *cm, const MACROBLOCKD *xd,
-                              aom_reader *r, int lf_id,
-                              MB_MODE_INFO *const mbmi, int mi_col,
+static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
+                              aom_cdf_prob *const cdf,
+                              const MB_MODE_INFO *const mbmi, int mi_col,
                               int mi_row) {
-  int sign, abs, reduced_delta_lflevel = 0;
-  BLOCK_SIZE bsize = mbmi->sb_type;
+  int reduced_delta_lflevel = 0;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int b_col = mi_col & (cm->seq_params.mib_size - 1);
   const int b_row = mi_row & (cm->seq_params.mib_size - 1);
   const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
       read_delta_lf_flag) {
-    if (cm->delta_lf_multi) {
-      assert(lf_id >= 0 &&
-             lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
-                                             : FRAME_LF_COUNT - 2));
-      abs = aom_read_symbol(r, ec_ctx->delta_lf_multi_cdf[lf_id],
-                            DELTA_LF_PROBS + 1, ACCT_STR);
-    } else {
-      abs = aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1,
-                            ACCT_STR);
-    }
+    int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_LF_SMALL);
     if (!smallval) {
       const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
       const int thr = (1 << rem_bits) + 1;
       abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
     }
-
-    if (abs) {
-      sign = aom_read_bit(r, ACCT_STR);
-    } else {
-      sign = 1;
-    }
-
+    const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1;
     reduced_delta_lflevel = sign ? -abs : abs;
   }
   return reduced_delta_lflevel;
@@ -618,19 +602,22 @@ static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
 void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
                       int blk_col, TX_SIZE tx_size, aom_reader *r) {
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const int inter_block = is_inter_block(mbmi);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-
   const int txk_type_idx =
       av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
   TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx];
+  *tx_type = DCT_DCT;
+
+  // No need to read transform type if block is skipped.
+  if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+    return;
+
+  // No need to read transform type for lossless mode (qindex == 0).
+  const int qindex =
+      cm->seg.enabled ? xd->qindex[mbmi->segment_id] : cm->base_qindex;
+  if (qindex <= 0) return;
 
-  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
-  if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1 &&
-      ((!cm->seg.enabled && cm->base_qindex > 0) ||
-       (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-      !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  const int inter_block = is_inter_block(mbmi);
+  if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1) {
     const TxSetType tx_set_type =
         av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
     const int eset =
@@ -639,23 +626,22 @@ void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
     // there is no need to read the tx_type
     assert(eset != 0);
 
+    const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
     if (inter_block) {
       *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
           r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
           av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
     } else {
-      PREDICTION_MODE intra_dir;
-      if (mbmi->filter_intra_mode_info.use_filter_intra)
-        intra_dir =
-            fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
-      else
-        intra_dir = mbmi->mode;
+      const PREDICTION_MODE intra_mode =
+          mbmi->filter_intra_mode_info.use_filter_intra
+              ? fimode_to_intradir[mbmi->filter_intra_mode_info
+                                       .filter_intra_mode]
+              : mbmi->mode;
       *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-          r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+          r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
           av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
     }
-  } else {
-    *tx_type = DCT_DCT;
   }
 }
 
@@ -720,6 +706,43 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
+// If delta q is present, reads the delta qindex and updates xd->current_qindex.
+// If delta lf is also present, reads the delta loop filter level(s) as well.
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                const int mi_row, const int mi_col,
+                                aom_reader *r) {
+  if (cm->delta_q_present_flag) {  // Nothing is read when delta q is off.
+    MB_MODE_INFO *const mbmi = xd->mi[0];
+    xd->current_qindex +=
+        read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+    /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+    xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+    FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+    if (cm->delta_lf_present_flag) {
+      if (cm->delta_lf_multi) {  // One delta is read per loop filter id.
+        const int frame_lf_count =
+            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+          const int tmp_lvl =
+              xd->delta_lf[lf_id] +
+              read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
+                                 mi_col, mi_row) *
+                  cm->delta_lf_res;
+          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+        }
+      } else {  // A single delta is accumulated in delta_lf_from_base.
+        const int tmp_lvl = xd->delta_lf_from_base +
+                            read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
+                                               mbmi, mi_col, mi_row) *
+                                cm->delta_lf_res;
+        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+      }
+    }
+  }
+}
+
 static void read_intra_frame_mode_info(AV1_COMMON *const cm,
                                        MACROBLOCKD *const xd, int mi_row,
                                        int mi_col, aom_reader *r) {
@@ -743,33 +766,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
 
   read_cdef(cm, r, xd, mi_col, mi_row);
 
-  if (cm->delta_q_present_flag) {
-    xd->current_qindex +=
-        read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
-    /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
-    xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
-    if (cm->delta_lf_present_flag) {
-      if (cm->delta_lf_multi) {
-        const int frame_lf_count =
-            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          const int tmp_lvl =
-              xd->delta_lf[lf_id] +
-              read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
-                  cm->delta_lf_res;
-          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
-              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
-        }
-      } else {
-        const int tmp_lvl =
-            xd->delta_lf_from_base +
-            read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
-                cm->delta_lf_res;
-        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
-            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
-      }
-    }
-  }
+  read_delta_q_params(cm, xd, mi_row, mi_col, r);
 
   mbmi->current_qindex = xd->current_qindex;
 
@@ -1402,7 +1399,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
       !has_second_ref(mbmi))
-    mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
 
   if (mbmi->ref_frame[1] != INTRA_FRAME)
@@ -1463,20 +1460,20 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   read_mb_interp_filter(cm, xd, mbmi, r);
 
   if (mbmi->motion_mode == WARPED_CAUSAL) {
-    mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
-    mbmi->wm_params[0].invalid = 0;
+    mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+    mbmi->wm_params.invalid = 0;
 
-    if (mbmi->num_proj_ref[0] > 1)
-      mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
-                                            mbmi->num_proj_ref[0], bsize);
+    if (mbmi->num_proj_ref > 1)
+      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                         mbmi->num_proj_ref, bsize);
 
-    if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+    if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
                         mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-                        &mbmi->wm_params[0], mi_row, mi_col)) {
+                        &mbmi->wm_params, mi_row, mi_col)) {
 #if WARPED_MOTION_DEBUG
       printf("Warning: unexpected warped model from aomenc\n");
 #endif
-      mbmi->wm_params[0].invalid = 1;
+      mbmi->wm_params.invalid = 1;
     }
   }
 
@@ -1512,33 +1509,7 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
 
   read_cdef(cm, r, xd, mi_col, mi_row);
 
-  if (cm->delta_q_present_flag) {
-    xd->current_qindex +=
-        read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
-    /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
-    xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
-    if (cm->delta_lf_present_flag) {
-      if (cm->delta_lf_multi) {
-        const int frame_lf_count =
-            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          const int tmp_lvl =
-              xd->delta_lf[lf_id] +
-              read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
-                  cm->delta_lf_res;
-          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
-              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
-        }
-      } else {
-        const int tmp_lvl =
-            xd->delta_lf_from_base +
-            read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
-                cm->delta_lf_res;
-        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
-            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
-      }
-    }
-  }
+  read_delta_q_params(cm, xd, mi_row, mi_col, r);
 
   if (!mbmi->skip_mode)
     inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
index 6243bb168..1625e5bd2 100644
--- a/third_party/aom/av1/decoder/decodemv.h
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_DECODER_DECODEMV_H_
-#define AV1_DECODER_DECODEMV_H_
+#ifndef AOM_AV1_DECODER_DECODEMV_H_
+#define AOM_AV1_DECODER_DECODEMV_H_
 
 #include "aom_dsp/bitreader.h"
 
@@ -32,4 +32,4 @@ void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
 void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
                       int blk_col, TX_SIZE tx_size, aom_reader *r);
 
-#endif  // AV1_DECODER_DECODEMV_H_
+#endif  // AOM_AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
index e978fad6c..a5f4fd67f 100644
--- a/third_party/aom/av1/decoder/decoder.c
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -37,16 +37,11 @@
 #include "av1/decoder/obu.h"
 
 static void initialize_dec(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    av1_rtcd();
-    aom_dsp_rtcd();
-    aom_scale_rtcd();
-    av1_init_intra_predictors();
-    av1_init_wedge_masks();
-    init_done = 1;
-  }
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+  av1_init_intra_predictors();
+  av1_init_wedge_masks();
 }
 
 static void dec_setup_mi(AV1_COMMON *cm) {
@@ -171,8 +166,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
   if (pbi->thread_data) {
     for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
       DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-      const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
-      av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+      av1_free_mc_tmp_buf(thread_data->td);
       aom_free(thread_data->td);
     }
     aom_free(pbi->thread_data);
@@ -209,8 +203,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
 #if CONFIG_ACCOUNTING
   aom_accounting_clear(&pbi->accounting);
 #endif
-  const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
-  av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+  av1_free_mc_tmp_buf(&pbi->td);
 
   aom_free(pbi);
 }
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
index 610b98d95..5ca939c24 100644
--- a/third_party/aom/av1/decoder/decoder.h
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_DECODER_DECODER_H_
-#define AV1_DECODER_DECODER_H_
+#ifndef AOM_AV1_DECODER_DECODER_H_
+#define AOM_AV1_DECODER_DECODER_H_
 
 #include "config/aom_config.h"
 
@@ -55,6 +55,11 @@ typedef struct ThreadData {
   CB_BUFFER cb_buffer_base;
   uint8_t *mc_buf[2];
   int32_t mc_buf_size;
+  int mc_buf_use_highbd;  // Boolean: whether the byte pointers stored in
+                          // mc_buf were converted from highbd pointers.
+
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
 
   decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
   decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
@@ -199,6 +204,7 @@ typedef struct AV1Decoder {
   int tg_start;  // First tile in the current tilegroup
   int tg_size_bit_offset;
   int sequence_header_ready;
+  int sequence_header_changed;
 #if CONFIG_INSPECTION
   aom_inspect_cb inspect_cb;
   void *inspect_ctx;
@@ -308,4 +314,4 @@ typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
 }  // extern "C"
 #endif
 
-#endif  // AV1_DECODER_DECODER_H_
+#endif  // AOM_AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
index 687bba958..fe04f6abd 100644
--- a/third_party/aom/av1/decoder/decodetxb.h
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef DECODETXB_H_
-#define DECODETXB_H_
+#ifndef AOM_AV1_DECODER_DECODETXB_H_
+#define AOM_AV1_DECODER_DECODETXB_H_
 
 #include "config/aom_config.h"
 
@@ -29,4 +29,4 @@ void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
                                 MACROBLOCKD *const xd, aom_reader *const r,
                                 const int plane, const int row, const int col,
                                 const TX_SIZE tx_size);
-#endif  //  DECODETXB_H_
+#endif  // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
index ec85bf7ea..173b437a9 100644
--- a/third_party/aom/av1/decoder/detokenize.h
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_DECODER_DETOKENIZE_H_
-#define AV1_DECODER_DETOKENIZE_H_
+#ifndef AOM_AV1_DECODER_DETOKENIZE_H_
+#define AOM_AV1_DECODER_DETOKENIZE_H_
 
 #include "config/aom_config.h"
 
@@ -26,4 +26,4 @@ void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // AV1_DECODER_DETOKENIZE_H_
+#endif  // AOM_AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
index 9f854e015..1d264b07e 100644
--- a/third_party/aom/av1/decoder/dthread.h
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_DECODER_DTHREAD_H_
-#define AV1_DECODER_DTHREAD_H_
+#ifndef AOM_AV1_DECODER_DTHREAD_H_
+#define AOM_AV1_DECODER_DTHREAD_H_
 
 #include "config/aom_config.h"
 
@@ -79,4 +79,4 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
 }  // extern "C"
 #endif
 
-#endif  // AV1_DECODER_DTHREAD_H_
+#endif  // AOM_AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
index bb604f684..7214a9bed 100644
--- a/third_party/aom/av1/decoder/inspection.h
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AOM_INSPECTION_H_
-#define AOM_INSPECTION_H_
+#ifndef AOM_AV1_DECODER_INSPECTION_H_
+#define AOM_AV1_DECODER_INSPECTION_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -81,4 +81,4 @@ int ifd_inspect(insp_frame_data *fd, void *decoder);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // AOM_INSPECTION_H_
+#endif  // AOM_AV1_DECODER_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
index 715bc6837..44ecf818e 100644
--- a/third_party/aom/av1/decoder/obu.c
+++ b/third_party/aom/av1/decoder/obu.c
@@ -18,6 +18,7 @@
 #include "aom_ports/mem_ops.h"
 
 #include "av1/common/common.h"
+#include "av1/common/obu_util.h"
 #include "av1/common/timing.h"
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/decodeframe.h"
@@ -42,85 +43,6 @@ typedef enum {
   SCALABILITY_SS = 14
 } SCALABILITY_STRUCTURES;
 
-// Returns 1 when OBU type is valid, and 0 otherwise.
-static int valid_obu_type(int obu_type) {
-  int valid_type = 0;
-  switch (obu_type) {
-    case OBU_SEQUENCE_HEADER:
-    case OBU_TEMPORAL_DELIMITER:
-    case OBU_FRAME_HEADER:
-    case OBU_TILE_GROUP:
-    case OBU_METADATA:
-    case OBU_FRAME:
-    case OBU_REDUNDANT_FRAME_HEADER:
-    case OBU_TILE_LIST:
-    case OBU_PADDING: valid_type = 1; break;
-    default: break;
-  }
-  return valid_type;
-}
-
-// Parses OBU header and stores values in 'header'.
-static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
-                                       int is_annexb, ObuHeader *header) {
-  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
-
-  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
-  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
-
-  header->size = 1;
-
-  if (aom_rb_read_bit(rb) != 0) {
-    // Forbidden bit. Must not be set.
-    return AOM_CODEC_CORRUPT_FRAME;
-  }
-
-  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
-
-  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
-
-  header->has_extension = aom_rb_read_bit(rb);
-  header->has_size_field = aom_rb_read_bit(rb);
-
-  if (!header->has_size_field && !is_annexb) {
-    // section 5 obu streams must have obu_size field set.
-    return AOM_CODEC_UNSUP_BITSTREAM;
-  }
-
-  if (aom_rb_read_bit(rb) != 0) {
-    // obu_reserved_1bit must be set to 0.
-    return AOM_CODEC_CORRUPT_FRAME;
-  }
-
-  if (header->has_extension) {
-    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
-
-    header->size += 1;
-    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
-    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
-    if (aom_rb_read_literal(rb, 3) != 0) {
-      // extension_header_reserved_3bits must be set to 0.
-      return AOM_CODEC_CORRUPT_FRAME;
-    }
-  }
-
-  return AOM_CODEC_OK;
-}
-
-aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
-                                    size_t *consumed, ObuHeader *header,
-                                    int is_annexb) {
-  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
-
-  // TODO(tomfinegan): Set the error handler here and throughout this file, and
-  // confirm parsing work done via aom_read_bit_buffer is successful.
-  struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
-                                    NULL };
-  aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
-  if (parse_result == AOM_CODEC_OK) *consumed = header->size;
-  return parse_result;
-}
-
 aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
     int operating_point_idc, unsigned int *number_spatial_layers,
     unsigned int *number_temporal_layers) {
@@ -208,7 +130,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   SequenceHeader *const seq_params = &sh;
 
   seq_params->profile = av1_read_profile(rb);
-  if (seq_params->profile > PROFILE_2) {
+  if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
     cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
     return 0;
   }
@@ -349,10 +271,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   // If a sequence header has been decoded before, we check if the new
   // one is consistent with the old one.
   if (pbi->sequence_header_ready) {
-    if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Inconsistent sequence headers received.");
-    }
+    if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+      pbi->sequence_header_changed = 1;
   }
 
   cm->seq_params = *seq_params;
@@ -620,9 +540,9 @@ static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
 
 static void scalability_structure(struct aom_read_bit_buffer *rb) {
   int spatial_layers_cnt = aom_rb_read_literal(rb, 2);
-  int spatial_layer_dimensions_present_flag = aom_rb_read_literal(rb, 1);
-  int spatial_layer_description_present_flag = aom_rb_read_literal(rb, 1);
-  int temporal_group_description_present_flag = aom_rb_read_literal(rb, 1);
+  int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
+  int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
+  int temporal_group_description_present_flag = aom_rb_read_bit(rb);
   aom_rb_read_literal(rb, 3);  // reserved
 
   if (spatial_layer_dimensions_present_flag) {
@@ -643,8 +563,8 @@ static void scalability_structure(struct aom_read_bit_buffer *rb) {
     temporal_group_size = aom_rb_read_literal(rb, 8);
     for (i = 0; i < temporal_group_size; i++) {
       aom_rb_read_literal(rb, 3);
-      aom_rb_read_literal(rb, 1);
-      aom_rb_read_literal(rb, 1);
+      aom_rb_read_bit(rb);
+      aom_rb_read_bit(rb);
       int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
       for (j = 0; j < temporal_group_ref_cnt; j++) {
         aom_rb_read_literal(rb, 8);
@@ -716,61 +636,6 @@ static size_t read_metadata(const uint8_t *data, size_t sz) {
   return sz;
 }
 
-static aom_codec_err_t read_obu_size(const uint8_t *data,
-                                     size_t bytes_available,
-                                     size_t *const obu_size,
-                                     size_t *const length_field_size) {
-  uint64_t u_obu_size = 0;
-  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
-      0) {
-    return AOM_CODEC_CORRUPT_FRAME;
-  }
-
-  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
-  *obu_size = (size_t)u_obu_size;
-  return AOM_CODEC_OK;
-}
-
-aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
-                                             size_t bytes_available,
-                                             int is_annexb,
-                                             ObuHeader *obu_header,
-                                             size_t *const payload_size,
-                                             size_t *const bytes_read) {
-  size_t length_field_size = 0, obu_size = 0;
-  aom_codec_err_t status;
-
-  if (is_annexb) {
-    // Size field comes before the OBU header, and includes the OBU header
-    status =
-        read_obu_size(data, bytes_available, &obu_size, &length_field_size);
-
-    if (status != AOM_CODEC_OK) return status;
-  }
-
-  struct aom_read_bit_buffer rb = { data + length_field_size,
-                                    data + bytes_available, 0, NULL, NULL };
-
-  status = read_obu_header(&rb, is_annexb, obu_header);
-  if (status != AOM_CODEC_OK) return status;
-
-  if (is_annexb) {
-    // Derive the payload size from the data we've already read
-    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
-
-    *payload_size = obu_size - obu_header->size;
-  } else {
-    // Size field comes after the OBU header, and is just the payload size
-    status = read_obu_size(data + obu_header->size,
-                           bytes_available - obu_header->size, payload_size,
-                           &length_field_size);
-    if (status != AOM_CODEC_OK) return status;
-  }
-
-  *bytes_read = length_field_size + obu_header->size;
-  return AOM_CODEC_OK;
-}
-
 // On success, returns a boolean that indicates whether the decoding of the
 // current frame is finished. On failure, sets cm->error.error_code and
 // returns -1.
@@ -781,8 +646,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
   int frame_decoding_finished = 0;
   int is_first_tg_obu_received = 1;
   uint32_t frame_header_size = 0;
-  int seq_header_received = 0;
-  size_t seq_header_size = 0;
   ObuHeader obu_header;
   memset(&obu_header, 0, sizeof(obu_header));
   pbi->seen_frame_header = 0;
@@ -853,19 +716,8 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
         pbi->seen_frame_header = 0;
         break;
       case OBU_SEQUENCE_HEADER:
-        if (!seq_header_received) {
-          decoded_payload_size = read_sequence_header_obu(pbi, &rb);
-          if (cm->error.error_code != AOM_CODEC_OK) return -1;
-
-          seq_header_size = decoded_payload_size;
-          seq_header_received = 1;
-        } else {
-          // Seeing another sequence header, skip as all sequence headers are
-          // required to be identical except for the contents of
-          // operating_parameters_info and the amount of trailing bits.
-          // TODO(yaowu): verifying redundant sequence headers are identical.
-          decoded_payload_size = seq_header_size;
-        }
+        decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_FRAME_HEADER:
       case OBU_REDUNDANT_FRAME_HEADER:
@@ -889,6 +741,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
           assert(rb.bit_offset == 0);
           rb.bit_offset = 8 * frame_header_size;
         }
+
         decoded_payload_size = frame_header_size;
         pbi->frame_header_size = frame_header_size;
 
@@ -938,6 +791,11 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
         decoded_payload_size = read_metadata(data, payload_size);
         break;
       case OBU_TILE_LIST:
+        if (CONFIG_NORMAL_TILE_MODE) {
+          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+          return -1;
+        }
+
         // This OBU type is purely for the large scale tile coding mode.
         // The common camera frame header has to be already decoded.
         if (!pbi->camera_frame_header_ready) {
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
index 5f2197058..5ab243fc9 100644
--- a/third_party/aom/av1/decoder/obu.h
+++ b/third_party/aom/av1/decoder/obu.h
@@ -9,35 +9,12 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_DECODER_OBU_H
-#define AV1_DECODER_OBU_H
+#ifndef AOM_AV1_DECODER_OBU_H_
+#define AOM_AV1_DECODER_OBU_H_
 
 #include "aom/aom_codec.h"
 #include "av1/decoder/decoder.h"
 
-typedef struct {
-  size_t size;  // Size (1 or 2 bytes) of the OBU header (including the
-                // optional OBU extension header) in the bitstream.
-  OBU_TYPE type;
-  int has_size_field;
-  int has_extension;
-  // The following fields come from the OBU extension header and therefore are
-  // only used if has_extension is true.
-  int temporal_layer_id;
-  int spatial_layer_id;
-} ObuHeader;
-
-aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
-                                    size_t *consumed, ObuHeader *header,
-                                    int is_annexb);
-
-aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
-                                             size_t bytes_available,
-                                             int is_annexb,
-                                             ObuHeader *obu_header,
-                                             size_t *const payload_size,
-                                             size_t *const bytes_read);
-
 // Try to decode one frame from a buffer.
 // Returns 1 if we decoded a frame,
 //         0 if we didn't decode a frame but that's okay
@@ -51,4 +28,4 @@ aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
     int operating_point_idc, unsigned int *num_spatial_layers,
     unsigned int *num_temporal_layers);
 
-#endif
+#endif  // AOM_AV1_DECODER_OBU_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
index b721b6d2b..80f8e2e66 100644
--- a/third_party/aom/av1/encoder/aq_complexity.c
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -143,9 +143,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
         get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
 
     aom_clear_system_state();
-    low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
-                                                    MIN_DEFAULT_LV_THRESH)
-                                           : DEFAULT_LV_THRESH;
+    low_var_thresh =
+        (cpi->oxcf.pass == 2)
+            ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+            : DEFAULT_LV_THRESH;
 
     av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes);
     logvar = av1_log_block_var(cpi, mb, bs);
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
index af525b36d..3421d74c9 100644
--- a/third_party/aom/av1/encoder/aq_complexity.h
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_
-#define AV1_ENCODER_AQ_COMPLEXITY_H_
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -34,4 +34,4 @@ void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_AQ_COMPLEXITY_H_
+#endif  // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index dec2c730d..f532d48da 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -80,9 +80,11 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
 }
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
-  aom_free(cr->map);
-  aom_free(cr->last_coded_q_map);
-  aom_free(cr);
+  if (cr != NULL) {
+    aom_free(cr->map);
+    aom_free(cr->last_coded_q_map);
+    aom_free(cr);
+  }
 }
 
 // Check if we should turn off cyclic refresh based on bitrate condition.
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
index 459ab80b8..b45781983 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.h
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_
-#define AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
 
 #include "av1/common/blockd.h"
 
@@ -95,4 +95,4 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#endif  // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index 6cb6adc42..58f906bdc 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -14,34 +14,33 @@
 #include "aom_ports/mem.h"
 
 #include "av1/encoder/aq_variance.h"
-
 #include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/dwt.h"
 #include "aom_ports/system_state.h"
 
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+                                                 0.9, .8,  .7,  .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5,  2.0, 1.5, 1.0,
+                                                        0.75, 1.0, 1.0, 1.0 };
 #define ENERGY_MIN (-4)
 #define ENERGY_MAX (1)
 #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
 #define ENERGY_IN_BOUNDS(energy) \
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
 
-static const double rate_ratio[MAX_SEGMENTS] = { 2.5,  2.0, 1.5, 1.0,
-                                                 0.75, 1.0, 1.0, 1.0 };
-static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
-
-#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
-
 DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
 DECLARE_ALIGNED(16, static const uint16_t,
                 av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
 
-unsigned int av1_vaq_segment_id(int energy) {
-  ENERGY_IN_BOUNDS(energy);
-  return SEGMENT_ID(energy);
-}
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
 
 void av1_vaq_frame_setup(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
@@ -51,6 +50,12 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
   int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
+  int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+  double avg_ratio;
+  if (avg_energy > 7) avg_energy = 7;
+  if (avg_energy < 0) avg_energy = 0;
+  avg_ratio = rate_ratio[avg_energy];
+
   if (resolution_change) {
     memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     av1_clearall_segfeatures(seg);
@@ -69,9 +74,11 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
     aom_clear_system_state();
 
     for (i = 0; i < MAX_SEGMENTS; ++i) {
-      int qindex_delta =
-          av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                     rate_ratio[i], cm->seq_params.bit_depth);
+      // Set up avg segment id to be 1.0 and adjust the other segments around
+      // it.
+      int qindex_delta = av1_compute_qdelta_by_rate(
+          &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[i] / avg_ratio,
+          cm->seq_params.bit_depth);
 
       // We don't allow qindex 0 in a segment if the base value is not 0.
       // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -87,114 +94,58 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
   }
 }
 
-/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions
- * of variance() and highbd_8_variance(). It should not.
- */
-static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                        int b_stride, int w, int h, unsigned int *sse,
-                        int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // This function returns a score for the block's local variance as calculated
+  // by: sum of the log of the (4x4 variances) of each subblock to the current
+  // block (x,bs)
+  // * 32 / number of pixels in the block_size.
+  // This is used for segmentation to avoid situations in which a large
+  // block with a gentle gradient gets marked high variance even though each
+  // subblock has a low variance.   This allows us to assign the same segment
+  // number for the same sorts of area regardless of how the partitioning goes.
 
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
-                                 const uint8_t *b8, int b_stride, int w, int h,
-                                 uint64_t *sse, uint64_t *sum) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  double var = 0;
+  unsigned int sse;
   int i, j;
 
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
-                                 const uint8_t *b8, int b_stride, int w, int h,
-                                 unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-
-static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                   BLOCK_SIZE bs) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int var, sse;
   int right_overflow =
       (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
   int bottom_overflow =
       (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
 
-  if (right_overflow || bottom_overflow) {
-    const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
-    const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
-    int avg;
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
-                           CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
-                           &sse, &avg);
-      sse >>= 2 * (xd->bd - 8);
-      avg >>= (xd->bd - 8);
-    } else {
-      aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
-                  bw, bh, &sse, &avg);
-    }
-    var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
-    return (unsigned int)((uint64_t)var * 256) / (bw * bh);
-  } else {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      var =
-          cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                             CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse);
-    } else {
-      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                               av1_all_zeros, 0, &sse);
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  aom_clear_system_state();
+
+  for (i = 0; i < bh; i += 4) {
+    for (j = 0; j < bw; j += 4) {
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        var +=
+            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride,
+                          CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+                          16);
+      } else {
+        var +=
+            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+                          16);
+      }
     }
-    return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
   }
-}
+  // Use average of 4x4 log variance. The range for 8 bit is 0 - 9.704121561.
+  var /= (bw / 4 * bh / 4);
+  if (var > 7) var = 7;
 
-double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
-  unsigned int var = block_variance(cpi, x, bs);
   aom_clear_system_state();
-  return log(var + 1.0);
+  return (int)(var);
 }
 
 #define DEFAULT_E_MIDPOINT 10.0
-int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
-  double energy;
-  double energy_midpoint;
-  aom_clear_system_state();
-  energy_midpoint =
-      (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
-  energy = av1_log_block_var(cpi, x, bs) - energy_midpoint;
-  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
-}
 
 unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
   MACROBLOCKD *xd = &x->e_mbd;
@@ -231,17 +182,21 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
 
 int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
                                          int block_var_level) {
-  ENERGY_IN_BOUNDS(block_var_level);
-
-  const int rate_level = SEGMENT_ID(block_var_level);
+  int rate_level;
   const AV1_COMMON *const cm = &cpi->common;
+
+  if (DELTAQ_MODULATION == 1) {
+    ENERGY_IN_BOUNDS(block_var_level);
+    rate_level = SEGMENT_ID(block_var_level);
+  } else {
+    rate_level = block_var_level;
+  }
   int qindex_delta = av1_compute_qdelta_by_rate(
-      &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level],
+      &cpi->rc, cm->frame_type, cm->base_qindex, deltaq_rate_ratio[rate_level],
       cm->seq_params.bit_depth);
 
   if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
     qindex_delta = -cm->base_qindex + 1;
   }
-
   return qindex_delta;
 }
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
index b1a8bc38a..2d22b663e 100644
--- a/third_party/aom/av1/encoder/aq_variance.h
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_AQ_VARIANCE_H_
-#define AV1_ENCODER_AQ_VARIANCE_H_
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
 
 #include "av1/encoder/encoder.h"
 
@@ -18,11 +18,9 @@
 extern "C" {
 #endif
 
-unsigned int av1_vaq_segment_id(int energy);
 void av1_vaq_frame_setup(AV1_COMP *cpi);
 
-int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
-double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
                                          int block_var_level);
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -32,4 +30,4 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_AQ_VARIANCE_H_
+#endif  // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
index b92b3469f..98505e0b1 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -11,24 +11,7 @@
 
 #include <stdlib.h>
 #include "av1/encoder/av1_fwd_txfm1d.h"
-
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
-                      int32_t size, int8_t bit);
-
-#define range_check(stage, input, buf, size, bit) \
-  range_check_func(stage, input, buf, size, bit)
-#else  // CONFIG_COEFFICIENT_RANGE_CHECKING
-
-#define range_check(stage, input, buf, size, bit) \
-  {                                               \
-    (void)stage;                                  \
-    (void)input;                                  \
-    (void)buf;                                    \
-    (void)size;                                   \
-    (void)bit;                                    \
-  }
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "av1/common/av1_txfm.h"
 
 void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
@@ -40,7 +23,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[4];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -49,7 +32,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = input[1] + input[2];
   bf1[2] = -input[2] + input[1];
   bf1[3] = -input[3] + input[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -60,7 +43,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
   bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
   bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -70,7 +53,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = bf0[2];
   bf1[2] = bf0[1];
   bf1[3] = bf0[3];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
 void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -83,7 +66,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[8];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -96,7 +79,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = -input[5] + input[2];
   bf1[6] = -input[6] + input[1];
   bf1[7] = -input[7] + input[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -111,7 +94,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
   bf1[7] = bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -126,7 +109,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = -bf0[5] + bf0[4];
   bf1[6] = -bf0[6] + bf0[7];
   bf1[7] = bf0[7] + bf0[6];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -141,7 +124,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
   bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
   bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -155,7 +138,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[5];
   bf1[6] = bf0[3];
   bf1[7] = bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
 void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -168,7 +151,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[16];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -189,7 +172,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = -input[13] + input[2];
   bf1[14] = -input[14] + input[1];
   bf1[15] = -input[15] + input[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -212,7 +195,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -235,7 +218,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = -bf0[13] + bf0[14];
   bf1[14] = bf0[14] + bf0[13];
   bf1[15] = bf0[15] + bf0[12];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -258,7 +241,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
   bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
   bf1[15] = bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -281,7 +264,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = -bf0[13] + bf0[12];
   bf1[14] = -bf0[14] + bf0[15];
   bf1[15] = bf0[15] + bf0[14];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -304,7 +287,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
   bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
   bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -326,7 +309,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[11];
   bf1[14] = bf0[7];
   bf1[15] = bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
 void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -339,7 +322,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[32];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -376,7 +359,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = -input[29] + input[2];
   bf1[30] = -input[30] + input[1];
   bf1[31] = -input[31] + input[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -415,7 +398,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -454,7 +437,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = bf0[29] + bf0[26];
   bf1[30] = bf0[30] + bf0[25];
   bf1[31] = bf0[31] + bf0[24];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -493,7 +476,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -532,7 +515,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = -bf0[29] + bf0[30];
   bf1[30] = bf0[30] + bf0[29];
   bf1[31] = bf0[31] + bf0[28];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -571,7 +554,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
   bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -610,7 +593,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = -bf0[29] + bf0[28];
   bf1[30] = -bf0[30] + bf0[31];
   bf1[31] = bf0[31] + bf0[30];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -649,7 +632,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
   bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
   bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -687,7 +670,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = bf0[23];
   bf1[30] = bf0[15];
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
 void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -698,7 +681,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   // stage 0
-  range_check(0, input, input, 4, stage_range[0]);
+  av1_range_check_buf(0, input, input, 4, stage_range[0]);
   x0 = input[0];
   x1 = input[1];
   x2 = input[2];
@@ -746,7 +729,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   output[1] = round_shift(s1, bit);
   output[2] = round_shift(s2, bit);
   output[3] = round_shift(s3, bit);
-  range_check(6, input, output, 4, stage_range[6]);
+  av1_range_check_buf(6, input, output, 4, stage_range[6]);
 }
 
 void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -759,7 +742,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[8];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -773,7 +756,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = input[6];
   bf1[6] = input[2];
   bf1[7] = -input[5];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -788,7 +771,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[5];
   bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -802,7 +785,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[5] + bf0[7];
   bf1[6] = bf0[4] - bf0[6];
   bf1[7] = bf0[5] - bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -817,7 +800,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
   bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -831,7 +814,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[1] - bf0[5];
   bf1[6] = bf0[2] - bf0[6];
   bf1[7] = bf0[3] - bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -846,7 +829,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
   bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -860,7 +843,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[2];
   bf1[6] = bf0[7];
   bf1[7] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
 void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -873,7 +856,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[16];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -895,7 +878,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = -input[13];
   bf1[14] = -input[5];
   bf1[15] = input[10];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -918,7 +901,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[13];
   bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -940,7 +923,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[13] + bf0[15];
   bf1[14] = bf0[12] - bf0[14];
   bf1[15] = bf0[13] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -963,7 +946,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
   bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -985,7 +968,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[9] - bf0[13];
   bf1[14] = bf0[10] - bf0[14];
   bf1[15] = bf0[11] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -1008,7 +991,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
   bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -1030,7 +1013,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[5] - bf0[13];
   bf1[14] = bf0[6] - bf0[14];
   bf1[15] = bf0[7] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -1053,7 +1036,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -1075,7 +1058,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[2];
   bf1[14] = bf0[15];
   bf1[15] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
 
 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1084,14 +1067,14 @@ void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   for (int i = 0; i < 4; ++i)
     output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
   assert(stage_range[0] + NewSqrt2Bits <= 32);
-  range_check(0, input, output, 4, stage_range[0]);
+  av1_range_check_buf(0, input, output, 4, stage_range[0]);
 }
 
 void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
   (void)cos_bit;
   for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
-  range_check(0, input, output, 8, stage_range[0]);
+  av1_range_check_buf(0, input, output, 8, stage_range[0]);
 }
 
 void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1100,14 +1083,14 @@ void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
   for (int i = 0; i < 16; ++i)
     output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
   assert(stage_range[0] + NewSqrt2Bits <= 32);
-  range_check(0, input, output, 16, stage_range[0]);
+  av1_range_check_buf(0, input, output, 16, stage_range[0]);
 }
 
 void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
   (void)cos_bit;
   for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
-  range_check(0, input, output, 32, stage_range[0]);
+  av1_range_check_buf(0, input, output, 32, stage_range[0]);
 }
 
 void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1120,7 +1103,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   int32_t step[64];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
@@ -1189,7 +1172,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = -input[61] + input[2];
   bf1[62] = -input[62] + input[1];
   bf1[63] = -input[63] + input[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -1260,7 +1243,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -1331,7 +1314,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61] + bf0[50];
   bf1[62] = bf0[62] + bf0[49];
   bf1[63] = bf0[63] + bf0[48];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -1402,7 +1385,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -1473,7 +1456,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61] + bf0[58];
   bf1[62] = bf0[62] + bf0[57];
   bf1[63] = bf0[63] + bf0[56];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -1544,7 +1527,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -1615,7 +1598,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = -bf0[61] + bf0[62];
   bf1[62] = bf0[62] + bf0[61];
   bf1[63] = bf0[63] + bf0[60];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -1686,7 +1669,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
   bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -1757,7 +1740,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = -bf0[61] + bf0[60];
   bf1[62] = -bf0[62] + bf0[63];
   bf1[63] = bf0[63] + bf0[62];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 10
   stage++;
@@ -1828,7 +1811,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
   bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
   bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 11
   stage++;
@@ -1898,5 +1881,5 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[47];
   bf1[62] = bf0[31];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 }
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
index 9472af8e6..9dcf16552 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_FWD_TXFM1D_H_
-#define AV1_FWD_TXFM1D_H_
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
 
 #include "av1/common/av1_txfm.h"
 
@@ -46,4 +46,4 @@ void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
 }
 #endif
 
-#endif  // AV1_FWD_TXFM1D_H_
+#endif  // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
index 174689a14..98b6530db 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -9,11 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_FWD_TXFM2D_CFG_H_
-#define AV1_FWD_TXFM2D_CFG_H_
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
 #include "av1/common/enums.h"
 #include "av1/encoder/av1_fwd_txfm1d.h"
 extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
 extern const int8_t fwd_cos_bit_col[5][5];
 extern const int8_t fwd_cos_bit_row[5][5];
-#endif  // AV1_FWD_TXFM2D_CFG_H_
+#endif  // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index d0477b35b..a0a926005 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -273,35 +273,32 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qm_ptr != NULL && iqm_ptr != NULL) {
-    quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
-                        p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
-                        qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-                        sc->scan, sc->iscan, qm_ptr, iqm_ptr,
-                        qparam->log_scale);
+    quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                        p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                        sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
   } else {
     switch (qparam->log_scale) {
       case 0:
-        aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
-                       p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
-                       qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-                       sc->scan, sc->iscan);
+        aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                       p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                       dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                       sc->iscan);
         break;
       case 1:
-        aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
-                             p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
-                             qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-                             sc->scan, sc->iscan);
+        aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                             sc->iscan);
         break;
       case 2:
-        aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
-                             p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
-                             qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-                             sc->scan, sc->iscan);
+        aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                             sc->iscan);
         break;
       default: assert(0);
     }
@@ -392,28 +389,25 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                   const SCAN_ORDER *sc,
                                   const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qm_ptr != NULL && iqm_ptr != NULL) {
-    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
-                               p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
-                               qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-                               sc->scan, sc->iscan, qm_ptr, iqm_ptr,
-                               qparam->log_scale);
+    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
   } else {
     switch (qparam->log_scale) {
       case 0:
         if (LIKELY(n_coeffs >= 8)) {
-          aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
-                                p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
-                                qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-                                eob_ptr, sc->scan, sc->iscan);
+          aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                                p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                                dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                                sc->iscan);
         } else {
           // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
           // quantization
-          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
                                   p->round_QTX, p->quant_QTX,
                                   p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
                                   p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
@@ -421,15 +415,15 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
         break;
       case 1:
         aom_highbd_quantize_b_32x32(
-            coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
-            p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
-            p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+            coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+            p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+            eob_ptr, sc->scan, sc->iscan);
         break;
       case 2:
         aom_highbd_quantize_b_64x64(
-            coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
-            p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
-            p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+            coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+            p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+            eob_ptr, sc->scan, sc->iscan);
         break;
       default: assert(0);
     }
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
index eaf8374de..35af9a67a 100644
--- a/third_party/aom/av1/encoder/av1_quantize.h
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_QUANTIZE_H_
-#define AV1_ENCODER_QUANTIZE_H_
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
 
 #include "config/aom_config.h"
 
@@ -145,4 +145,4 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_QUANTIZE_H_
+#endif  // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index 2070755cd..2c4acdb02 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -18,6 +18,7 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_dsp/bitwriter_buffer.h"
 #include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
 #include "aom_ports/mem_ops.h"
 #include "aom_ports/system_state.h"
 #if CONFIG_BITSTREAM_DEBUG
@@ -30,7 +31,6 @@
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
 #include "av1/common/mvref_common.h"
-#include "av1/common/odintrin.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
@@ -66,11 +66,11 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
                                              aom_writer *const w, int plane,
                                              FRAME_COUNTS *counts);
 
-static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx,
-                                const MB_MODE_INFO *mi,
-                                const MB_MODE_INFO *above_mi,
-                                const MB_MODE_INFO *left_mi,
-                                PREDICTION_MODE mode, aom_writer *w) {
+static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                  const MB_MODE_INFO *mi,
+                                  const MB_MODE_INFO *above_mi,
+                                  const MB_MODE_INFO *left_mi,
+                                  PREDICTION_MODE mode, aom_writer *w) {
   assert(!is_intrabc_block(mi));
   (void)mi;
   aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
@@ -297,7 +297,7 @@ static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
                    DELTA_Q_PROBS + 1);
 
   if (!smallval) {
-    rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+    rem_bits = get_msb(abs - 1);
     thr = (1 << rem_bits) + 1;
     aom_write_literal(w, rem_bits - 1, 3);
     aom_write_literal(w, abs - thr, rem_bits);
@@ -326,7 +326,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   if (!smallval) {
-    rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+    rem_bits = get_msb(abs - 1);
     thr = (1 << rem_bits) + 1;
     aom_write_literal(w, rem_bits - 1, 3);
     aom_write_literal(w, abs - thr, rem_bits);
@@ -836,8 +836,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
   }
 }
 
-static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
-                             PREDICTION_MODE mode, aom_writer *w) {
+static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+                                     PREDICTION_MODE mode, aom_writer *w) {
   aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
                    INTRA_MODES);
 }
@@ -933,45 +933,24 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
   }
 }
 
-static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
-                                const int mi_col, aom_writer *w) {
+// If delta q is present, writes delta_q index.
+// Also writes delta_q loop filter levels, if present.
+static void write_delta_q_params(AV1_COMP *cpi, const int mi_row,
+                                 const int mi_col, int skip, aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  const struct segmentation *const seg = &cm->seg;
-  struct segmentation_probs *const segp = &ec_ctx->seg;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const PREDICTION_MODE mode = mbmi->mode;
-  const int segment_id = mbmi->segment_id;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int allow_hp = cm->allow_high_precision_mv;
-  const int is_inter = is_inter_block(mbmi);
-  const int is_compound = has_second_ref(mbmi);
-  int skip, ref;
-  (void)mi_row;
-  (void)mi_col;
-
-  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
-
-  write_skip_mode(cm, xd, segment_id, mbmi, w);
-
-  assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
-  skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
-
-  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
-
-  write_cdef(cm, xd, w, skip, mi_col, mi_row);
-
   if (cm->delta_q_present_flag) {
-    int super_block_upper_left =
+    MACROBLOCK *const x = &cpi->td.mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const MB_MODE_INFO *const mbmi = xd->mi[0];
+    const BLOCK_SIZE bsize = mbmi->sb_type;
+    const int super_block_upper_left =
         ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
         ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
     if ((bsize != cm->seq_params.sb_size || skip == 0) &&
         super_block_upper_left) {
       assert(mbmi->current_qindex > 0);
-      int reduced_delta_qindex =
+      const int reduced_delta_qindex =
           (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
       write_delta_qindex(xd, reduced_delta_qindex, w);
       xd->current_qindex = mbmi->current_qindex;
@@ -996,37 +975,96 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
       }
     }
   }
+}
 
-  if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row,
+                                         const int mi_col, int is_keyframe,
+                                         aom_writer *w) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PREDICTION_MODE mode = mbmi->mode;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
 
-  if (mbmi->skip_mode) return;
+  // Y mode.
+  if (is_keyframe) {
+    const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+    const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+    write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+  } else {
+    write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w);
+  }
 
-  if (!is_inter) {
-    write_intra_mode(ec_ctx, bsize, mode, w);
-    const int use_angle_delta = av1_use_angle_delta(bsize);
+  // Y angle delta.
+  const int use_angle_delta = av1_use_angle_delta(bsize);
+  if (use_angle_delta && av1_is_directional_mode(mode)) {
+    write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+                      ec_ctx->angle_delta_cdf[mode - V_PRED]);
+  }
 
-    if (use_angle_delta && av1_is_directional_mode(mode)) {
-      write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
-                        ec_ctx->angle_delta_cdf[mode - V_PRED]);
+  // UV mode and UV angle delta.
+  if (!cm->seq_params.monochrome &&
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y)) {
+    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+    write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+    if (uv_mode == UV_CFL_PRED)
+      write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+    if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+      write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+                        ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
     }
+  }
 
-    if (!cm->seq_params.monochrome &&
-        is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                            xd->plane[1].subsampling_y)) {
-      const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
-      write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
-      if (uv_mode == UV_CFL_PRED)
-        write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
-      if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
-        write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
-                          ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
-      }
-    }
+  // Palette.
+  if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+    write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+  }
 
-    if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-      write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+  // Filter intra.
+  write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
 
-    write_filter_intra_mode_info(cm, xd, mbmi, w);
+static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+                                const int mi_col, aom_writer *w) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  const struct segmentation *const seg = &cm->seg;
+  struct segmentation_probs *const segp = &ec_ctx->seg;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const PREDICTION_MODE mode = mbmi->mode;
+  const int segment_id = mbmi->segment_id;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int allow_hp = cm->allow_high_precision_mv;
+  const int is_inter = is_inter_block(mbmi);
+  const int is_compound = has_second_ref(mbmi);
+  int ref;
+
+  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
+
+  write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+  assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+  const int skip =
+      mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
+
+  write_cdef(cm, xd, w, skip, mi_col, mi_row);
+
+  write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+  if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+  if (mbmi->skip_mode) return;
+
+  if (!is_inter) {
+    write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w);
   } else {
     int16_t mode_ctx;
 
@@ -1172,11 +1210,7 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const struct segmentation *const seg = &cm->seg;
   struct segmentation_probs *const segp = &ec_ctx->seg;
-  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const PREDICTION_MODE mode = mbmi->mode;
 
   if (seg->segid_preskip && seg->update_map)
     write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
@@ -1188,69 +1222,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
 
   write_cdef(cm, xd, w, skip, mi_col, mi_row);
 
-  if (cm->delta_q_present_flag) {
-    int super_block_upper_left =
-        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
-    if ((bsize != cm->seq_params.sb_size || skip == 0) &&
-        super_block_upper_left) {
-      assert(mbmi->current_qindex > 0);
-      int reduced_delta_qindex =
-          (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
-      write_delta_qindex(xd, reduced_delta_qindex, w);
-      xd->current_qindex = mbmi->current_qindex;
-      if (cm->delta_lf_present_flag) {
-        if (cm->delta_lf_multi) {
-          const int frame_lf_count =
-              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-            int reduced_delta_lflevel =
-                (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
-                cm->delta_lf_res;
-            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
-            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
-          }
-        } else {
-          int reduced_delta_lflevel =
-              (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
-              cm->delta_lf_res;
-          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
-          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
-        }
-      }
-    }
-  }
+  write_delta_q_params(cpi, mi_row, mi_col, skip, w);
 
   if (av1_allow_intrabc(cm)) {
     write_intrabc_info(xd, mbmi_ext, w);
     if (is_intrabc_block(mbmi)) return;
   }
 
-  write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
-
-  const int use_angle_delta = av1_use_angle_delta(bsize);
-  if (use_angle_delta && av1_is_directional_mode(mode)) {
-    write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
-                      ec_ctx->angle_delta_cdf[mode - V_PRED]);
-  }
-
-  if (!cm->seq_params.monochrome &&
-      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y)) {
-    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
-    write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
-    if (uv_mode == UV_CFL_PRED)
-      write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
-    if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
-      write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
-                        ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
-    }
-  }
-
-  if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
-
-  write_filter_intra_mode_info(cm, xd, mbmi, w);
+  write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w);
 }
 
 #if CONFIG_RD_DEBUG
@@ -1549,10 +1528,10 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
           write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
     } else {
       write_selected_tx_size(xd, w);
-      set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd);
+      set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd);
     }
   } else {
-    set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+    set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
                   skip && is_inter_block(mbmi), xd);
   }
 
@@ -1694,15 +1673,14 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
 }
 
 static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
-                        aom_writer *const w, const TOKENEXTRA **tok,
-                        const TOKENEXTRA *const tok_end) {
+                        aom_writer *const w, int tile_row, int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   const int mi_row_start = tile->mi_row_start;
   const int mi_row_end = tile->mi_row_end;
   const int mi_col_start = tile->mi_col_start;
   const int mi_col_end = tile->mi_col_end;
-  int mi_row, mi_col;
+  int mi_row, mi_col, sb_row_in_tile;
 
   av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
   av1_init_above_context(cm, xd, tile->tile_row);
@@ -1716,13 +1694,21 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
 
   for (mi_row = mi_row_start; mi_row < mi_row_end;
        mi_row += cm->seq_params.mib_size) {
+    sb_row_in_tile =
+        (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
+    const TOKENEXTRA *tok =
+        cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
+    const TOKENEXTRA *tok_end =
+        tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+
     av1_zero_left_context(xd);
 
     for (mi_col = mi_col_start; mi_col < mi_col_end;
          mi_col += cm->seq_params.mib_size) {
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+      write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
                      cm->seq_params.sb_size);
     }
+    assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
   }
 }
 
@@ -2220,33 +2206,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm,
   }
 }
 
-#if USE_GF16_MULTI_LAYER
-static int get_refresh_mask_gf16(AV1_COMP *cpi) {
-  if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
-    return 0xFF;
-
-  int refresh_mask = 0;
-
-  if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
-      cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
-      cpi->refresh_alt_ref_frame) {
-    assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
-    refresh_mask |= (1 << cpi->refresh_fb_idx);
-  }
-
-  return refresh_mask;
-}
-#endif  // USE_GF16_MULTI_LAYER
-
 static int get_refresh_mask(AV1_COMP *cpi) {
   if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) ||
       frame_is_sframe(&cpi->common))
     return 0xFF;
 
   int refresh_mask = 0;
-#if USE_GF16_MULTI_LAYER
-  if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi);
-#endif  // USE_GF16_MULTI_LAYER
 
   // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
   // notified to get LAST3_FRAME refreshed and then the virtual indexes for all
@@ -2281,8 +2246,13 @@ static int get_refresh_mask(AV1_COMP *cpi) {
     // Note: This is highly specific to the use of ARF as a forward reference,
     // and this needs to be generalized as other uses are implemented
     // (like RTC/temporal scalability).
-    return refresh_mask |
-           (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+
+    if (cpi->preserve_arf_as_gld) {
+      return refresh_mask;
+    } else {
+      return refresh_mask |
+             (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+    }
   } else {
     const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
     return refresh_mask |
@@ -2574,9 +2544,9 @@ static void write_film_grain_params(AV1_COMP *cpi,
 
   aom_wb_write_literal(wb, pars->random_seed, 16);
 
-  pars->random_seed += 3245;  // For film grain test vectors purposes
+  pars->random_seed += 3381;  // Changing random seed for film grain
   if (!pars->random_seed)     // Random seed should not be zero
-    pars->random_seed += 1735;
+    pars->random_seed += 7391;
   if (cm->frame_type == INTER_FRAME)
     aom_wb_write_bit(wb, pars->update_parameters);
   else
@@ -2685,7 +2655,8 @@ static void write_sb_size(SequenceHeader *seq_params,
   aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
 }
 
-void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
+static void write_sequence_header(AV1_COMP *cpi,
+                                  struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *seq_params = &cm->seq_params;
 
@@ -2695,8 +2666,10 @@ void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
   int max_frame_height = cpi->oxcf.forced_max_frame_height
                              ? cpi->oxcf.forced_max_frame_height
                              : cpi->oxcf.height;
+  // max((int)ceil(log2(max_frame_width)), 1)
   const int num_bits_width =
       (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
+  // max((int)ceil(log2(max_frame_height)), 1)
   const int num_bits_height =
       (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
   assert(num_bits_width <= 16);
@@ -2954,7 +2927,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
     assert(cm->frame_type == KEY_FRAME);
   }
   if (!seq_params->reduced_still_picture_hdr) {
-    if (cm->show_existing_frame) {
+    if (encode_show_existing_frame(cm)) {
       RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
       const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
 
@@ -3254,14 +3227,14 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
   if (cm->base_qindex > 0) {
     aom_wb_write_bit(wb, cm->delta_q_present_flag);
     if (cm->delta_q_present_flag) {
-      aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+      aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2);
       xd->current_qindex = cm->base_qindex;
       if (cm->allow_intrabc)
         assert(cm->delta_lf_present_flag == 0);
       else
         aom_wb_write_bit(wb, cm->delta_lf_present_flag);
       if (cm->delta_lf_present_flag) {
-        aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+        aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2);
         aom_wb_write_bit(wb, cm->delta_lf_multi);
         av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
       }
@@ -3508,7 +3481,7 @@ static void write_bitstream_level(BitstreamLevel bl,
   aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
 }
 
-static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
   AV1_COMMON *const cm = &cpi->common;
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
@@ -3619,7 +3592,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
   AV1_COMMON *const cm = &cpi->common;
   aom_writer mode_bc;
   int tile_row, tile_col;
-  TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
   TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
   uint32_t total_size = 0;
   const int tile_cols = cm->tile_cols;
@@ -3684,8 +3656,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
 
       for (tile_row = 0; tile_row < tile_rows; tile_row++) {
         TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
-        const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
-        const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
         const int data_offset = have_tiles ? 4 : 0;
         const int tile_idx = tile_row * tile_cols + tile_col;
         TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
@@ -3703,8 +3673,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
         mode_bc.allow_update_cdf =
             mode_bc.allow_update_cdf && !cm->disable_cdf_update;
         aom_start_encode(&mode_bc, buf->data + data_offset);
-        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
-        assert(tok == tok_end);
+        write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
         aom_stop_encode(&mode_bc);
         tile_size = mode_bc.pos;
         buf->size = tile_size;
@@ -3794,8 +3763,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
       const int tile_idx = tile_row * tile_cols + tile_col;
       TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
       TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
-      const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
-      const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
       int is_last_tile_in_tg = 0;
 
       if (new_tg) {
@@ -3847,7 +3814,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
       av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
 
       aom_start_encode(&mode_bc, dst + total_size);
-      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+      write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
       aom_stop_encode(&mode_bc);
       tile_size = mode_bc.pos;
       assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
@@ -3990,7 +3957,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
     data += obu_header_size + obu_payload_size + length_field_size;
   }
 
-  const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame);
+  const int write_frame_header =
+      (cm->num_tg > 1 || encode_show_existing_frame(cm));
   struct aom_write_bit_buffer saved_wb;
   if (write_frame_header) {
     // Write Frame Header OBU.
@@ -4017,7 +3985,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
     saved_wb.bit_buffer += length_field_size;
   }
 
-  if (cm->show_existing_frame) {
+  if (encode_show_existing_frame(cm)) {
     data_size = 0;
   } else {
     //  Each tile group obu will be preceded by 4-byte size of the tile group
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
index 2047b6833..465ccaed5 100644
--- a/third_party/aom/av1/encoder/bitstream.h
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_BITSTREAM_H_
-#define AV1_ENCODER_BITSTREAM_H_
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -20,8 +20,13 @@ extern "C" {
 
 struct aom_write_bit_buffer;
 
-void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb);
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
 
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
 uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
                           uint8_t *const dst);
 
@@ -32,8 +37,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
 
 static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
   // Do not swap gf and arf indices for internal overlay frames
-  return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
-         !cpi->rc.is_src_frame_ext_arf;
+  return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf;
 }
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
@@ -44,4 +48,4 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_BITSTREAM_H_
+#endif  // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 003e59e39..0bc5dea82 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_BLOCK_H_
-#define AV1_ENCODER_BLOCK_H_
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
 
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
@@ -170,6 +170,7 @@ typedef struct {
   InterpFilters filters;
   int_mv mv[2];
   int8_t ref_frames[2];
+  COMPOUND_TYPE comp_type;
 } INTERPOLATION_FILTER_STATS;
 
 typedef struct macroblock MACROBLOCK;
@@ -254,6 +255,19 @@ struct macroblock {
 
   PALETTE_BUFFER *palette_buffer;
 
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
+
+  // buffer for hash value calculation of a block
+  // used only in av1_get_block_hash_value()
+  // [first hash/second hash]
+  // [two buffers used ping-pong]
+  uint32_t *hash_value_buffer[2][2];
+
+  CRC_CALCULATOR crc_calculator1;
+  CRC_CALCULATOR crc_calculator2;
+  int g_crc_initialized;
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   MvLimits mv_limits;
@@ -344,7 +358,6 @@ struct macroblock {
 #if CONFIG_DIST_8X8
   int using_dist_8x8;
   aom_tune_metric tune_metric;
-  DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]);
 #endif  // CONFIG_DIST_8X8
   int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
   int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
@@ -352,6 +365,8 @@ struct macroblock {
   int tx_search_prune[EXT_TX_SET_TYPES];
   int must_find_valid_partition;
   int tx_split_prune_flag;  // Flag to skip tx split RD search.
+  int recalc_luma_mc_data;  // Flag to indicate recalculation of MC data during
+                            // interpolation filter search
 };
 
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
@@ -400,8 +415,38 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
   return depth;
 }
 
+static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+                                int skip) {
+  if (skip)
+    x->blk_skip[blk_idx] |= 1UL << plane;
+  else
+    x->blk_skip[blk_idx] &= ~(1UL << plane);
+#ifndef NDEBUG
+  // Mark the chroma planes as uninitialized whenever luma is set, so that
+  // is_blk_skip() can assert they were set again before being read
+  if (plane == 0) {
+    x->blk_skip[blk_idx] |= 1UL << (1 + 4);
+    x->blk_skip[blk_idx] |= 1UL << (2 + 4);
+  }
+
+  // Clear the initialization checking bit
+  x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
+static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+#ifndef NDEBUG
+  // Check if this is initialized
+  assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4))));
+
+  // The debug fill pattern is 0x77, so any bit in 0x88 means garbage data
+  assert((x->blk_skip[blk_idx] & 0x88) == 0);
+#endif
+  return (x->blk_skip[blk_idx] >> plane) & 1;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_BLOCK_H_
+#endif  // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
index 66dedd9ed..f7cff9e53 100644
--- a/third_party/aom/av1/encoder/blockiness.c
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -16,7 +16,6 @@
 #include "av1/common/common.h"
 #include "av1/common/filter.h"
 #include "aom/aom_integer.h"
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
index d6e556b93..57f59f304 100644
--- a/third_party/aom/av1/encoder/context_tree.c
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -175,14 +175,15 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
 }
 
 void av1_free_pc_tree(ThreadData *td, const int num_planes) {
-  const int tree_nodes_inc = 1024;
-
-  const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
-  int i;
-  for (i = 0; i < tree_nodes; ++i)
-    free_tree_contexts(&td->pc_tree[i], num_planes);
-  aom_free(td->pc_tree);
-  td->pc_tree = NULL;
+  if (td->pc_tree != NULL) {
+    const int tree_nodes_inc = 1024;
+    const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+    for (int i = 0; i < tree_nodes; ++i) {
+      free_tree_contexts(&td->pc_tree[i], num_planes);
+    }
+    aom_free(td->pc_tree);
+    td->pc_tree = NULL;
+  }
 }
 
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
index c05f48a7a..4efc34985 100644
--- a/third_party/aom/av1/encoder/context_tree.h
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_CONTEXT_TREE_H_
-#define AV1_ENCODER_CONTEXT_TREE_H_
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
 
 #include "av1/common/blockd.h"
 #include "av1/encoder/block.h"
@@ -56,6 +56,8 @@ typedef struct {
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
+  // Skip certain ref frames during RD search of rectangular partitions.
+  int skip_ref_frame_mask;
 
   // TODO(jingning) Use RD_COST struct here instead. This involves a boarder
   // scope of refactoring.
@@ -109,4 +111,4 @@ void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
 }  // extern "C"
 #endif
 
-#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */
+#endif  // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
index 0317db5b3..cab59a774 100644
--- a/third_party/aom/av1/encoder/corner_detect.h
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_CORNER_DETECT_H_
-#define AV1_ENCODER_CORNER_DETECT_H_
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -19,4 +19,4 @@
 int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
                        int *points, int max_points);
 
-#endif  // AV1_ENCODER_CORNER_DETECT_H_
+#endif  // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
index 3b16f9efc..535d2faed 100644
--- a/third_party/aom/av1/encoder/corner_match.h
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_ENCODER_CORNER_MATCH_H_
-#define AV1_ENCODER_CORNER_MATCH_H_
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,4 +30,4 @@ int determine_correspondence(unsigned char *frm, int *frm_corners,
                              int height, int frm_stride, int ref_stride,
                              int *correspondence_pts);
 
-#endif  // AV1_ENCODER_CORNER_MATCH_H_
+#endif  // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
index 5de7765c5..af5b09837 100644
--- a/third_party/aom/av1/encoder/cost.h
+++ b/third_party/aom/av1/encoder/cost.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_COST_H_
-#define AV1_ENCODER_COST_H_
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
 
 #include "aom_dsp/prob.h"
 #include "aom/aom_integer.h"
@@ -44,4 +44,4 @@ void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_COST_H_
+#endif  // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
index 03318e5b7..37306c6a5 100644
--- a/third_party/aom/av1/encoder/dwt.h
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
 #include "av1/common/common.h"
 #include "av1/common/enums.h"
 
@@ -18,3 +21,5 @@ void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
 void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
                                int hbd);
 int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+
+#endif  // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index 27ca53761..cb226c59e 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -40,11 +40,11 @@
 #include "av1/common/reconinter.h"
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
 
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
-#include "av1/common/warped_motion.h"
 #include "av1/encoder/global_motion.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemb.h"
@@ -56,6 +56,7 @@
 #include "av1/encoder/partition_model_weights.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 
@@ -348,8 +349,9 @@ static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
   x->skip = 0;
 }
 
-static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
-                         ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row,
+static void update_state(const AV1_COMP *const cpi,
+                         const TileDataEnc *const tile_data, ThreadData *td,
+                         const PICK_MODE_CONTEXT *const ctx, int mi_row,
                          int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
   int i, x_idx, y;
   const AV1_COMMON *const cm = &cpi->common;
@@ -359,7 +361,7 @@ static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  MB_MODE_INFO *mi = &ctx->mic;
+  const MB_MODE_INFO *const mi = &ctx->mic;
   MB_MODE_INFO *const mi_addr = xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
   const int bw = mi_size_wide[mi->sb_type];
@@ -505,12 +507,12 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
       cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
 }
 
-static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
                              MACROBLOCK *const x, int mi_row, int mi_col,
                              RD_STATS *rd_cost, PARTITION_TYPE partition,
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                              int64_t best_rd) {
-  const AV1_COMMON *const cm = &cpi->common;
+  AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -522,6 +524,13 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
   int i, orig_rdmult;
 
+  if (best_rd < 0) {
+    ctx->rdcost = INT64_MAX;
+    ctx->skip = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
   aom_clear_system_state();
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -588,9 +597,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
 
   if (aq_mode == VARIANCE_AQ) {
     if (cpi->vaq_refresh) {
-      const int energy =
-          bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
-      mbmi->segment_id = av1_vaq_segment_id(energy);
+      const int energy = bsize <= BLOCK_16X16
+                             ? x->mb_energy
+                             : av1_log_block_var(cpi, x, bsize);
+      mbmi->segment_id = energy;
     }
     x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
   } else if (aq_mode == COMPLEXITY_AQ) {
@@ -1407,8 +1417,8 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
 static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                      ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
                      RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                     PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx,
-                     int *rate) {
+                     PARTITION_TYPE partition,
+                     const PICK_MODE_CONTEXT *const ctx, int *rate) {
   TileInfo *const tile = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -1691,7 +1701,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
 
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    x->mb_energy = av1_block_energy(cpi, x, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
   }
 
   if (do_partition_search &&
@@ -1728,7 +1738,20 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       pc_tree->partitioning = partition;
     }
   }
-
+  for (int b = 0; b < 2; ++b) {
+    pc_tree->horizontal[b].skip_ref_frame_mask = 0;
+    pc_tree->vertical[b].skip_ref_frame_mask = 0;
+  }
+  for (int b = 0; b < 3; ++b) {
+    pc_tree->horizontala[b].skip_ref_frame_mask = 0;
+    pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
+    pc_tree->verticala[b].skip_ref_frame_mask = 0;
+    pc_tree->verticalb[b].skip_ref_frame_mask = 0;
+  }
+  for (int b = 0; b < 4; ++b) {
+    pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
+    pc_tree->vertical4[b].skip_ref_frame_mask = 0;
+  }
   switch (partition) {
     case PARTITION_NONE:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
@@ -1741,7 +1764,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + hbs < cm->mi_rows) {
         RD_STATS tmp_rdc;
-        PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+        const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
         av1_init_rd_stats(&tmp_rdc);
         update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
@@ -1765,7 +1788,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + hbs < cm->mi_cols) {
         RD_STATS tmp_rdc;
-        PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
+        const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
         av1_init_rd_stats(&tmp_rdc);
         update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
@@ -1812,7 +1835,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
     case PARTITION_HORZ_A:
     case PARTITION_HORZ_B:
     case PARTITION_HORZ_4:
-    case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types");
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
     default: assert(0); break;
   }
 
@@ -2164,7 +2188,8 @@ static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
 }
 
-static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+static INLINE void load_pred_mv(MACROBLOCK *x,
+                                const PICK_MODE_CONTEXT *const ctx) {
   memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
@@ -2221,12 +2246,11 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 // Try searching for an encoding for the given subblock. Returns zero if the
 // rdcost is already too high (to tell the caller not to bother searching for
 // encodings of further subblocks)
-static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
-                           TileDataEnc *tile_data, TOKENEXTRA **tp,
-                           int is_first, int is_last, int mi_row, int mi_col,
-                           BLOCK_SIZE subsize, RD_STATS *best_rdc,
-                           RD_STATS *sum_rdc, RD_STATS *this_rdc,
-                           PARTITION_TYPE partition,
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
+                           int mi_row, int mi_col, BLOCK_SIZE subsize,
+                           RD_STATS *best_rdc, RD_STATS *sum_rdc,
+                           RD_STATS *this_rdc, PARTITION_TYPE partition,
                            PICK_MODE_CONTEXT *prev_ctx,
                            PICK_MODE_CONTEXT *this_ctx) {
 #define RTS_X_RATE_NOCOEF_ARG
@@ -2236,25 +2260,20 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
 
   if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
 
-  // On the first time around, write the rd stats straight to sum_rdc. Also, we
-  // should treat sum_rdc as containing zeros (even if it doesn't) to avoid
-  // having to zero it at the start.
-  if (is_first) this_rdc = sum_rdc;
-  const int64_t spent_rdcost = is_first ? 0 : sum_rdc->rdcost;
-  const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost;
+  const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX
+                                       ? INT64_MAX
+                                       : (best_rdc->rdcost - sum_rdc->rdcost);
 
   rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
                    RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
                    rdcost_remaining);
 
-  if (!is_first) {
-    if (this_rdc->rate == INT_MAX) {
-      sum_rdc->rdcost = INT64_MAX;
-    } else {
-      sum_rdc->rate += this_rdc->rate;
-      sum_rdc->dist += this_rdc->dist;
-      sum_rdc->rdcost += this_rdc->rdcost;
-    }
+  if (this_rdc->rate == INT_MAX) {
+    sum_rdc->rdcost = INT64_MAX;
+  } else {
+    sum_rdc->rate += this_rdc->rate;
+    sum_rdc->dist += this_rdc->dist;
+    sum_rdc->rdcost += this_rdc->rdcost;
   }
 
   if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
@@ -2271,7 +2290,7 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
 #undef RTS_MAX_RDCOST
 }
 
-static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
                                TileDataEnc *tile_data, TOKENEXTRA **tp,
                                PC_TREE *pc_tree, RD_STATS *best_rdc,
                                PICK_MODE_CONTEXT ctxs[3],
@@ -2284,13 +2303,16 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
   MACROBLOCKD *const xd = &x->e_mbd;
   RD_STATS sum_rdc, this_rdc;
 #define RTP_STX_TRY_ARGS
-
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0,
+  int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  av1_init_rd_stats(&sum_rdc);
+  sum_rdc.rate = x->partition_cost[pl][partition];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
                        best_rdc, &sum_rdc, &this_rdc,
                        RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
     return;
 
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1,
+  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
                        best_rdc, &sum_rdc, &this_rdc,
                        RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
     return;
@@ -2302,15 +2324,13 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
   // difference (obviously) doesn't contribute to the error.
   const int try_block2 = 1;
   if (try_block2 &&
-      !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2,
+      !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
                        best_rdc, &sum_rdc, &this_rdc,
                        RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
     return;
 
   if (sum_rdc.rdcost >= best_rdc->rdcost) return;
 
-  int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-  sum_rdc.rate += x->partition_cost[pl][partition];
   sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
   if (sum_rdc.rdcost >= best_rdc->rdcost) return;
@@ -2321,45 +2341,6 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
 #undef RTP_STX_TRY_ARGS
 }
 
-#if CONFIG_DIST_8X8
-static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                            uint8_t *src_plane_8x8[MAX_MB_PLANE],
-                            uint8_t *dst_plane_8x8[MAX_MB_PLANE]) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int64_t dist_8x8, dist_8x8_uv, total_dist;
-  const int src_stride = x->plane[0].src.stride;
-  int plane;
-
-  const int dst_stride = xd->plane[0].dst.stride;
-  dist_8x8 =
-      av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0],
-                   dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
-      << 4;
-
-  // Compute chroma distortion for a luma 8x8 block
-  dist_8x8_uv = 0;
-
-  if (num_planes > 1) {
-    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-      unsigned sse;
-      const int src_stride_uv = x->plane[plane].src.stride;
-      const int dst_stride_uv = xd->plane[plane].dst.stride;
-      const int ssx = xd->plane[plane].subsampling_x;
-      const int ssy = xd->plane[plane].subsampling_y;
-      const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy);
-
-      cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv,
-                                  dst_plane_8x8[plane], dst_stride_uv, &sse);
-      dist_8x8_uv += (int64_t)sse << 4;
-    }
-  }
-
-  return total_dist = dist_8x8 + dist_8x8_uv;
-}
-#endif  // CONFIG_DIST_8X8
-
 static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   pc_tree->partitioning = PARTITION_NONE;
   pc_tree->cb_search_range = SEARCH_FULL_PLANE;
@@ -2372,7 +2353,7 @@ static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   }
 }
 
-static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
                                   TileDataEnc *tile_data, TOKENEXTRA **tp,
                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
                                   RD_STATS *rd_cost, int64_t best_rd,
@@ -2410,7 +2391,12 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
   (void)*tp_orig;
   (void)split_rd;
 
-  av1_zero(pc_tree->pc_tree_stats);
+  if (best_rd < 0) {
+    pc_tree->none.rdcost = INT64_MAX;
+    pc_tree->none.skip = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
   pc_tree->pc_tree_stats.valid = 1;
 
   // Override partition costs at the edges of the frame in the same
@@ -2441,9 +2427,11 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
 
 #ifndef NDEBUG
   // Nothing should rely on the default value of this array (which is just
-  // leftover from encoding the previous block. Setting it to magic number
+  // leftover from encoding the previous block). Set it to a fixed pattern
   // when debugging.
-  memset(x->blk_skip, 234, sizeof(x->blk_skip));
+  // bit 0, 1, 2 are blk_skip of each plane
+  // bit 4, 5, 6 are initialization checking of each plane
+  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
 #endif  // NDEBUG
 
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -2456,19 +2444,35 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
-    x->mb_energy = av1_block_energy(cpi, x, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
 
   xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8) {
+    if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+      do_square_split = 0;
+  }
+#endif
+
   // PARTITION_NONE
   if (partition_none_allowed) {
-    if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-
+    int pt_cost = 0;
+    if (bsize_at_least_8x8) {
+      pc_tree->partitioning = PARTITION_NONE;
+      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+                    ? partition_cost[PARTITION_NONE]
+                    : 0;
+    }
+    int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+    int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+                                     ? INT64_MAX
+                                     : (best_rdc.rdcost - partition_rd_cost);
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
 
     pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
     pc_tree->pc_tree_stats.skip = ctx_none->skip;
@@ -2476,9 +2480,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
     if (none_rd) *none_rd = this_rdc.rdcost;
     if (this_rdc.rate != INT_MAX) {
       if (bsize_at_least_8x8) {
-        const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
-                                ? partition_cost[PARTITION_NONE]
-                                : 0;
         this_rdc.rate += pt_cost;
         this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
       }
@@ -2520,17 +2521,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
   int64_t temp_best_rdcost = best_rdc.rdcost;
   pn_rdc = best_rdc;
 
-#if CONFIG_DIST_8X8
-  uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
-  if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
-    for (int i = 0; i < MAX_MB_PLANE; i++) {
-      src_plane_8x8[i] = x->plane[i].src.buf;
-      dst_plane_8x8[i] = xd->plane[i].dst.buf;
-    }
-  }
-#endif  // CONFIG_DIST_8X8
-
   // PARTITION_SPLIT
   if (do_square_split) {
     int reached_last_index = 0;
@@ -2548,6 +2538,8 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
 
       pc_tree->split[idx]->index = idx;
       int64_t *p_split_rd = &split_rd[idx];
+      // TODO(Cherma) : Account for partition cost while passing best rd to
+      // rd_pick_sqr_partition()
       rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx,
                             mi_col + x_idx, subsize, &this_rdc,
                             temp_best_rdcost - sum_rdc.rdcost,
@@ -2568,14 +2560,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
     }
     reached_last_index = (idx == 4);
 
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && reached_last_index &&
-        sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-      sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-    }
-#endif  // CONFIG_DIST_8X8
-
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -2634,14 +2618,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
     }
   }
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
-      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
-    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-              pc_tree, NULL);
-  }
-#endif  // CONFIG_DIST_8X8
-
   if (bsize == cm->seq_params.sb_size) {
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
@@ -2791,6 +2767,99 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
 }
 #undef FEATURE_SIZE
 
+static void ml_prune_rect_partition(const AV1_COMP *const cpi,
+                                    const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                    int64_t best_rd, int64_t none_rd,
+                                    int64_t *split_rd,
+                                    int *const dst_prune_horz,
+                                    int *const dst_prune_vert) {
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+  best_rd = AOMMAX(best_rd, 1);  // best_rd is used as a divisor below
+  const NN_CONFIG *nn_config = NULL;
+  const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+  float cur_thresh = 0.0f;
+  switch (bsize) {
+    case BLOCK_8X8:
+      nn_config = &av1_rect_partition_nnconfig_8;
+      cur_thresh = prob_thresholds[0];
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_rect_partition_nnconfig_16;
+      cur_thresh = prob_thresholds[1];
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_rect_partition_nnconfig_32;
+      cur_thresh = prob_thresholds[2];
+      break;
+    case BLOCK_64X64:
+      nn_config = &av1_rect_partition_nnconfig_64;
+      cur_thresh = prob_thresholds[3];
+      break;
+    case BLOCK_128X128:
+      nn_config = &av1_rect_partition_nnconfig_128;
+      cur_thresh = prob_thresholds[4];
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;  // no model was selected for this bsize
+  aom_clear_system_state();
+
+  // 1. Compute the 9 input features: 5 RD cost ratios and 4 variance ratios.
+  float features[9];
+
+  // RD cost ratios vs. best_rd; kept at 1.0 when a cost is out of range.
+  for (int i = 0; i < 5; i++) features[i] = 1.0f;
+  if (none_rd > 0 && none_rd < 1000000000)
+    features[0] = (float)none_rd / (float)best_rd;
+  for (int i = 0; i < 4; i++) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      features[1 + i] = (float)split_rd[i] / (float)best_rd;
+  }
+
+  // Variance ratios: each quadrant's source variance over the whole block's.
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  int whole_block_variance;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    whole_block_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    whole_block_variance =
+        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+  whole_block_variance = AOMMAX(whole_block_variance, 1);  // divisor below
+
+  int split_variance[4];
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  struct buf_2d buf;
+  buf.stride = x->plane[0].src.stride;
+  const int bw = block_size_wide[bsize];
+  for (int i = 0; i < 4; ++i) {
+    const int x_idx = (i & 1) * bw / 2;
+    const int y_idx = (i >> 1) * bw / 2;
+    buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      split_variance[i] =
+          av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
+    } else {
+      split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
+    }
+  }
+
+  for (int i = 0; i < 4; i++)
+    features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+  // 2. Run the model and prune 0-2 partitions based on their probabilities.
+  float raw_scores[3] = { 0.0f };
+  av1_nn_predict(features, nn_config, raw_scores);
+  float probs[3] = { 0.0f };
+  av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd; probs[1]/probs[2] gate HORZ/VERT pruning below.
+  if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
+  if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+}
+
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
 static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
@@ -2880,13 +2949,14 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
 #define FEATURES 18
 #define LABELS 4
 // Use a ML model to predict if horz4 and vert4 should be considered.
-static void ml_prune_4_partition(const AV1_COMP *const cpi,
-                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                 int part_ctx, int64_t best_rd,
-                                 int64_t horz_rd[2], int64_t vert_rd[2],
-                                 int64_t split_rd[4],
+static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                                 BLOCK_SIZE bsize, int part_ctx,
+                                 int64_t best_rd, int64_t horz_rd[2],
+                                 int64_t vert_rd[2], int64_t split_rd[4],
                                  int *const partition_horz4_allowed,
-                                 int *const partition_vert4_allowed) {
+                                 int *const partition_vert4_allowed,
+                                 unsigned int pb_source_variance, int mi_row,
+                                 int mi_col) {
   if (best_rd >= 1000000000) return;
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
@@ -2903,7 +2973,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
   float features[FEATURES];
   int feature_index = 0;
   features[feature_index++] = (float)part_ctx;
-  features[feature_index++] = (float)get_unsigned_bits(x->source_variance);
+  features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
 
   const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
   int sub_block_rdcost[8] = { 0 };
@@ -2937,6 +3007,8 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
   {
     BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
     BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common));
     const int src_stride = x->plane[0].src.stride;
     const uint8_t *src = x->plane[0].src.buf;
     const MACROBLOCKD *const xd = &x->e_mbd;
@@ -2990,7 +3062,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
     }
   }
 
-  const float denom = (float)(x->source_variance + 1);
+  const float denom = (float)(pb_source_variance + 1);
   const float low_b = 0.1f;
   const float high_b = 10.0f;
   for (int i = 0; i < 4; ++i) {
@@ -3022,9 +3094,9 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
   // Make decisions based on the model scores.
   int thresh = max_score;
   switch (bsize) {
-    case BLOCK_16X16: thresh -= 400; break;
-    case BLOCK_32X32: thresh -= 400; break;
-    case BLOCK_64X64: thresh -= 100; break;
+    case BLOCK_16X16: thresh -= 500; break;
+    case BLOCK_32X32: thresh -= 500; break;
+    case BLOCK_64X64: thresh -= 200; break;
     default: break;
   }
   *partition_horz4_allowed = 0;
@@ -3039,10 +3111,73 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
 #undef FEATURES
 #undef LABELS
 
+#define FEATURES 4
+// ML-based partition search breakout: returns 1 if score * 100 >= thresh.
+static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                               const MACROBLOCK *const x,
+                               const RD_STATS *const rd_stats,
+                               unsigned int pb_source_variance) {
+  const NN_CONFIG *nn_config = NULL;
+  int thresh = 0;
+  switch (bsize) {
+    case BLOCK_8X8:
+      nn_config = &av1_partition_breakout_nnconfig_8;
+      thresh = cpi->sf.ml_partition_search_breakout_thresh[0];
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_partition_breakout_nnconfig_16;
+      thresh = cpi->sf.ml_partition_search_breakout_thresh[1];
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_partition_breakout_nnconfig_32;
+      thresh = cpi->sf.ml_partition_search_breakout_thresh[2];
+      break;
+    case BLOCK_64X64:
+      nn_config = &av1_partition_breakout_nnconfig_64;
+      thresh = cpi->sf.ml_partition_search_breakout_thresh[3];
+      break;
+    case BLOCK_128X128:
+      nn_config = &av1_partition_breakout_nnconfig_128;
+      thresh = cpi->sf.ml_partition_search_breakout_thresh[4];
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config || thresh < 0) return 0;
+
+  // Generate feature values: scaled rate, scaled dist, variance, dc_q^2 / 256.
+  float features[FEATURES];
+  int feature_index = 0;
+  aom_clear_system_state();
+
+  const int num_pels_log2 = num_pels_log2_lookup[bsize];
+  float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+  rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+           rate_f;
+  features[feature_index++] = rate_f;
+
+  const float dist_f =
+      (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+  features[feature_index++] = dist_f;
+
+  features[feature_index++] = (float)pb_source_variance;
+
+  const int dc_q = (int)x->plane[0].dequant_QTX[0];
+  features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+  assert(feature_index == FEATURES);
+
+  // Calculate score using the NN model.
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, &score);
+
+  // Make decision: break out when the scaled score reaches the threshold.
+  return (int)(score * 100) >= thresh;
+}
+#undef FEATURES
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
-static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
                               TileDataEnc *tile_data, TOKENEXTRA **tp,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               RD_STATS *rd_cost, int64_t best_rd,
@@ -3068,6 +3203,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
       pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
 
   int do_rectangular_split = 1;
+  int64_t cur_none_rd = 0;
   int64_t split_rd[4] = { 0, 0, 0, 0 };
   int64_t horz_rd[2] = { 0, 0 };
   int64_t vert_rd[2] = { 0, 0 };
@@ -3077,6 +3213,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   int vert_ctx_is_ready = 0;
   BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 
+  if (best_rd < 0) {
+    pc_tree->none.rdcost = INT64_MAX;
+    pc_tree->none.skip = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
   if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
 
   // Override skipping rectangular partition operations for edge blocks
@@ -3129,9 +3271,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
 #ifndef NDEBUG
   // Nothing should rely on the default value of this array (which is just
-  // leftover from encoding the previous block. Setting it to magic number
+  // leftover from encoding the previous block). Setting it to a fixed pattern
   // when debugging.
-  memset(x->blk_skip, 234, sizeof(x->blk_skip));
+  // bit 0, 1, 2 are blk_skip of each plane
+  // bit 4, 5, 6 are initialization checking of each plane
+  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
 #endif  // NDEBUG
 
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -3143,7 +3287,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
-    x->mb_energy = av1_block_energy(cpi, x, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
 
   if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
     const int cb_partition_search_ctrl =
@@ -3285,22 +3429,56 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 #endif
 
+  // Ref frames picked in the [i_th] quarter subblock during square partition
+  // RD search. It may be used to prune ref frame selection of rect partitions.
+  int ref_frames_used[4] = {
+    0,
+  };
+
 BEGIN_PARTITION_SEARCH:
   if (x->must_find_valid_partition) {
     partition_none_allowed = has_rows && has_cols;
     partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
     partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
   }
+
+  // Partition block source pixel variance.
+  unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8) {
+    if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
+    if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
+    if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+      do_square_split = 0;
+  }
+#endif
+
   // PARTITION_NONE
   if (partition_none_allowed) {
+    int pt_cost = 0;
+    if (bsize_at_least_8x8) {
+      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+                    ? partition_cost[PARTITION_NONE]
+                    : 0;
+    }
+    int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+    int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX)
+                                     ? INT64_MAX
+                                     : (best_rdc.rdcost - partition_rd_cost);
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+    pb_source_variance = x->source_variance;
     if (none_rd) *none_rd = this_rdc.rdcost;
+    cur_none_rd = this_rdc.rdcost;
     if (this_rdc.rate != INT_MAX) {
+      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+        const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+        for (int i = 0; i < 4; ++i) {
+          ref_frames_used[i] |= (1 << ref_type);
+        }
+      }
       if (bsize_at_least_8x8) {
-        const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
-                                ? partition_cost[PARTITION_NONE]
-                                : 0;
         this_rdc.rate += pt_cost;
         this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
       }
@@ -3318,16 +3496,29 @@ BEGIN_PARTITION_SEARCH:
         best_rdc = this_rdc;
         if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
 
-        // If all y, u, v transform blocks in this partition are skippable, and
-        // the dist & rate are within the thresholds, the partition search is
-        // terminated for current branch of the partition search tree.
-        // The dist & rate thresholds are set to 0 at speed 0 to disable the
-        // early termination at that speed.
-        if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
-            (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
-             best_rdc.rate < rate_breakout_thr)) {
-          do_square_split = 0;
-          do_rectangular_split = 0;
+        if ((do_square_split || do_rectangular_split) &&
+            !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+          const int use_ml_based_breakout =
+              bsize <= cpi->sf.use_square_partition_only_threshold &&
+              bsize > BLOCK_4X4 && xd->bd == 8;
+          if (use_ml_based_breakout) {
+            if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
+                                    pb_source_variance)) {
+              do_square_split = 0;
+              do_rectangular_split = 0;
+            }
+          }
+
+          // If all y, u, v transform blocks in this partition are skippable,
+          // and the dist & rate are within the thresholds, the partition
+          // search is terminated for current branch of the partition search
+          // tree. The dist & rate thresholds are set to 0 at speed 0 to
+          // disable the early termination at that speed.
+          if (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr) {
+            do_square_split = 0;
+            do_rectangular_split = 0;
+          }
         }
 
 #if CONFIG_FP_MB_STATS
@@ -3384,24 +3575,14 @@ BEGIN_PARTITION_SEARCH:
   // store estimated motion vector
   if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
-#if CONFIG_DIST_8X8
-  uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
-  if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
-    for (int i = 0; i < num_planes; i++) {
-      src_plane_8x8[i] = x->plane[i].src.buf;
-      dst_plane_8x8[i] = xd->plane[i].dst.buf;
-    }
-  }
-#endif  // CONFIG_DIST_8X8
-
   // PARTITION_SPLIT
   if (do_square_split) {
     av1_init_rd_stats(&sum_rdc);
-    int reached_last_index = 0;
     subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    int idx;
+    sum_rdc.rate = partition_cost[PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
 
+    int idx;
     for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
       const int x_idx = (idx & 1) * mi_step;
       const int y_idx = (idx >> 1) * mi_step;
@@ -3413,8 +3594,13 @@ BEGIN_PARTITION_SEARCH:
 
       pc_tree->split[idx]->index = idx;
       int64_t *p_split_rd = &split_rd[idx];
+      int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+                                       ? INT64_MAX
+                                       : (best_rdc.rdcost - sum_rdc.rdcost);
+      if (cpi->sf.prune_ref_frame_for_rect_partitions)
+        pc_tree->split[idx]->none.rate = INT_MAX;
       rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
-                        subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost,
+                        subsize, &this_rdc, best_remain_rdcost,
                         pc_tree->split[idx], p_split_rd);
 
       if (this_rdc.rate == INT_MAX) {
@@ -3424,11 +3610,16 @@ BEGIN_PARTITION_SEARCH:
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
-
+        if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+            pc_tree->split[idx]->none.rate != INT_MAX) {
+          const int ref_type =
+              av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
+          ref_frames_used[idx] |= (1 << ref_type);
+        }
         if (idx <= 1 && (bsize <= BLOCK_8X8 ||
                          pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
-          MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic);
-          PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+          const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
+          const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
           // Neither palette mode nor cfl predicted
           if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
             if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
@@ -3436,60 +3627,83 @@ BEGIN_PARTITION_SEARCH:
         }
       }
     }
-    reached_last_index = (idx == 4);
-
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && reached_last_index &&
-        sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-      int64_t dist_8x8;
-      dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
-      // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
-      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
-        assert(sum_rdc.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-      sum_rdc.dist = dist_8x8;
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-    }
-#endif  // CONFIG_DIST_8X8
+    const int reached_last_index = (idx == 4);
 
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_SPLIT;
       }
-    } else if (cpi->sf.less_rectangular_check) {
+    } else if (cpi->sf.less_rectangular_check_level > 0) {
       // skip rectangular partition test when larger block size
       // gives better rd cost
-      do_rectangular_split &= !partition_none_allowed;
+      if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2)
+        do_rectangular_split &= !partition_none_allowed;
     }
 
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }  // if (do_split)
 
+  pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+  pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+  pc_tree->vertical[0].skip_ref_frame_mask = 0;
+  pc_tree->vertical[1].skip_ref_frame_mask = 0;
+  if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+    int used_frames;
+    used_frames = ref_frames_used[0] | ref_frames_used[1];
+    if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
+    used_frames = ref_frames_used[2] | ref_frames_used[3];
+    if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
+    used_frames = ref_frames_used[0] | ref_frames_used[2];
+    if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
+    used_frames = ref_frames_used[1] | ref_frames_used[3];
+    if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
+  }
+
+  int prune_horz = 0;
+  int prune_vert = 0;
+  if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+      (partition_horz_allowed || partition_vert_allowed)) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+    ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+                            split_rd, &prune_horz, &prune_vert);
+  }
+
   // PARTITION_HORZ
-  if (partition_horz_allowed &&
+  if (partition_horz_allowed && !prune_horz &&
       (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-        partition_none_allowed)
+        partition_none_allowed) {
       pc_tree->horizontal[0].pred_interp_filter =
           av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+    }
+    int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+                                     ? INT64_MAX
+                                     : (best_rdc.rdcost - sum_rdc.rdcost);
+    sum_rdc.rate = partition_cost[PARTITION_HORZ];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
                      PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                     best_rdc.rdcost);
-    horz_rd[0] = sum_rdc.rdcost;
+                     best_remain_rdcost);
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      sum_rdc.rdcost += this_rdc.rdcost;
+    }
+    horz_rd[0] = this_rdc.rdcost;
 
     if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
-      PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
-      MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic);
-      PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+      const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
       // Neither palette mode nor cfl predicted
       if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
         if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
@@ -3501,24 +3715,15 @@ BEGIN_PARTITION_SEARCH:
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
 
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-          partition_none_allowed)
+          partition_none_allowed) {
         pc_tree->horizontal[1].pred_interp_filter =
             av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
-
+      }
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
                        PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
       horz_rd[1] = this_rdc.rdcost;
 
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
-        update_state(cpi, tile_data, td, &pc_tree->horizontal[1],
-                     mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
-                          mi_row + mi_step, mi_col, subsize, NULL);
-      }
-#endif  // CONFIG_DIST_8X8
-
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
@@ -3526,24 +3731,9 @@ BEGIN_PARTITION_SEARCH:
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
       }
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
-          bsize == BLOCK_8X8) {
-        int64_t dist_8x8;
-        dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
-        // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
-        if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
-          assert(sum_rdc.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-        sum_rdc.dist = dist_8x8;
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      }
-#endif  // CONFIG_DIST_8X8
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += partition_cost[PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
@@ -3555,7 +3745,7 @@ BEGIN_PARTITION_SEARCH:
   }
 
   // PARTITION_VERT
-  if (partition_vert_allowed &&
+  if (partition_vert_allowed && !prune_vert &&
       (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
     av1_init_rd_stats(&sum_rdc);
     subsize = get_partition_subsize(bsize, PARTITION_VERT);
@@ -3563,18 +3753,31 @@ BEGIN_PARTITION_SEARCH:
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-        partition_none_allowed)
+        partition_none_allowed) {
       pc_tree->vertical[0].pred_interp_filter =
           av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+    }
+    sum_rdc.rate = partition_cost[PARTITION_VERT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+    int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+                                     ? INT64_MAX
+                                     : (best_rdc.rdcost - sum_rdc.rdcost);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
                      PARTITION_VERT, subsize, &pc_tree->vertical[0],
-                     best_rdc.rdcost);
-    vert_rd[0] = sum_rdc.rdcost;
+                     best_remain_rdcost);
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      sum_rdc.rdcost += this_rdc.rdcost;
+    }
+    vert_rd[0] = this_rdc.rdcost;
     const int64_t vert_max_rdcost = best_rdc.rdcost;
     if (sum_rdc.rdcost < vert_max_rdcost && has_cols) {
-      MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic);
-      PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
       // Neither palette mode nor cfl predicted
       if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
         if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
@@ -3587,24 +3790,15 @@ BEGIN_PARTITION_SEARCH:
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-          partition_none_allowed)
+          partition_none_allowed) {
         pc_tree->vertical[1].pred_interp_filter =
             av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
+      }
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
                        PARTITION_VERT, subsize, &pc_tree->vertical[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
       vert_rd[1] = this_rdc.rdcost;
 
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
-        update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row,
-                     mi_col + mi_step, subsize, DRY_RUN_NORMAL);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
-                          mi_col + mi_step, subsize, NULL);
-      }
-#endif  // CONFIG_DIST_8X8
-
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
@@ -3612,25 +3806,9 @@ BEGIN_PARTITION_SEARCH:
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
       }
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
-          bsize == BLOCK_8X8) {
-        int64_t dist_8x8;
-        dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
-        // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
-        if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 &&
-            0 /* !CONFIG_CFL */)
-          assert(sum_rdc.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-        sum_rdc.dist = dist_8x8;
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      }
-#endif  // CONFIG_DIST_8X8
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += partition_cost[PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
@@ -3641,6 +3819,17 @@ BEGIN_PARTITION_SEARCH:
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
+  if (pb_source_variance == UINT_MAX) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      pb_source_variance = av1_high_get_sby_perpixel_variance(
+          cpi, &x->plane[0].src, bsize, xd->bd);
+    } else {
+      pb_source_variance =
+          av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+    }
+  }
+
   const int ext_partition_allowed =
       do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
 
@@ -3649,15 +3838,26 @@ BEGIN_PARTITION_SEARCH:
   int horzab_partition_allowed = ext_partition_allowed;
   int vertab_partition_allowed = ext_partition_allowed;
 
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8) {
+    if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
+      horzab_partition_allowed = 0;
+      vertab_partition_allowed = 0;
+    }
+  }
+#endif
+
   if (cpi->sf.prune_ext_partition_types_search_level) {
     if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+      // TODO(debargha,huisu@google.com): may need to tune the threshold for
+      // pb_source_variance.
       horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
                                    (pc_tree->partitioning == PARTITION_NONE &&
-                                    x->source_variance < 32) ||
+                                    pb_source_variance < 32) ||
                                    pc_tree->partitioning == PARTITION_SPLIT);
       vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
                                    (pc_tree->partitioning == PARTITION_NONE &&
-                                    x->source_variance < 32) ||
+                                    pb_source_variance < 32) ||
                                    pc_tree->partitioning == PARTITION_SPLIT);
     } else {
       horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
@@ -3712,6 +3912,9 @@ BEGIN_PARTITION_SEARCH:
 
   if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
       partition_horz_allowed && partition_vert_allowed) {
+    // TODO(huisu@google.com): x->source_variance may not be the current block's
+    // variance. The correct one to use is pb_source_variance.
+    // Need to re-train the model to fix it.
     ml_prune_ab_partition(bsize, pc_tree->partitioning,
                           get_unsigned_bits(x->source_variance),
                           best_rdc.rdcost, horz_rd, vert_rd, split_rd,
@@ -3736,6 +3939,21 @@ BEGIN_PARTITION_SEARCH:
         pc_tree->horizontala[1].rd_mode_is_ready = 1;
       }
     }
+    pc_tree->horizontala[0].skip_ref_frame_mask = 0;
+    pc_tree->horizontala[1].skip_ref_frame_mask = 0;
+    pc_tree->horizontala[2].skip_ref_frame_mask = 0;
+    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+      int used_frames;
+      used_frames = ref_frames_used[0];
+      if (used_frames)
+        pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[1];
+      if (used_frames)
+        pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[2] | ref_frames_used[3];
+      if (used_frames)
+        pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
@@ -3754,6 +3972,21 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
       pc_tree->horizontalb[0].rd_mode_is_ready = 1;
     }
+    pc_tree->horizontalb[0].skip_ref_frame_mask = 0;
+    pc_tree->horizontalb[1].skip_ref_frame_mask = 0;
+    pc_tree->horizontalb[2].skip_ref_frame_mask = 0;
+    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+      int used_frames;
+      used_frames = ref_frames_used[0] | ref_frames_used[1];
+      if (used_frames)
+        pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[2];
+      if (used_frames)
+        pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[3];
+      if (used_frames)
+        pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_HORZ_B, mi_row, mi_col, subsize,
@@ -3773,6 +4006,18 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
       pc_tree->verticala[0].rd_mode_is_ready = 1;
     }
+    pc_tree->verticala[0].skip_ref_frame_mask = 0;
+    pc_tree->verticala[1].skip_ref_frame_mask = 0;
+    pc_tree->verticala[2].skip_ref_frame_mask = 0;
+    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+      int used_frames;
+      used_frames = ref_frames_used[0];
+      if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[2];
+      if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[1] | ref_frames_used[3];
+      if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_VERT_A, mi_row, mi_col, bsize2,
@@ -3791,6 +4036,18 @@ BEGIN_PARTITION_SEARCH:
       pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
       pc_tree->verticalb[0].rd_mode_is_ready = 1;
     }
+    pc_tree->verticalb[0].skip_ref_frame_mask = 0;
+    pc_tree->verticalb[1].skip_ref_frame_mask = 0;
+    pc_tree->verticalb[2].skip_ref_frame_mask = 0;
+    if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+      int used_frames;
+      used_frames = ref_frames_used[0] | ref_frames_used[2];
+      if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[1];
+      if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
+      used_frames = ref_frames_used[3];
+      if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
                        PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
@@ -3823,9 +4080,19 @@ BEGIN_PARTITION_SEARCH:
       partition_horz_allowed && partition_vert_allowed) {
     ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
                          horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
-                         &partition_vert4_allowed);
+                         &partition_vert4_allowed, pb_source_variance, mi_row,
+                         mi_col);
   }
 
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8) {
+    if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
+      partition_horz4_allowed = 0;
+      partition_vert4_allowed = 0;
+    }
+  }
+#endif
+
   // PARTITION_HORZ_4
   if (partition_horz4_allowed && has_rows &&
       (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
@@ -3834,25 +4101,33 @@ BEGIN_PARTITION_SEARCH:
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
 
     subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
 
     for (int i = 0; i < 4; ++i) {
-      int this_mi_row = mi_row + i * quarter_step;
+      const int this_mi_row = mi_row + i * quarter_step;
 
       if (i > 0 && this_mi_row >= cm->mi_rows) break;
 
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
 
       ctx_this->rd_mode_is_ready = 0;
-      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3),
-                           this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc,
-                           &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this))
+      ctx_this->skip_ref_frame_mask = 0;
+      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+        const int used_frames = i <= 1
+                                    ? (ref_frames_used[0] | ref_frames_used[1])
+                                    : (ref_frames_used[2] | ref_frames_used[3]);
+        if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+      }
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+                           mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+                           PARTITION_HORZ_4, ctx_prev, ctx_this))
         break;
 
       ctx_prev = ctx_this;
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += partition_cost[PARTITION_HORZ_4];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
@@ -3870,16 +4145,25 @@ BEGIN_PARTITION_SEARCH:
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
 
     subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+    sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
 
     for (int i = 0; i < 4; ++i) {
-      int this_mi_col = mi_col + i * quarter_step;
+      const int this_mi_col = mi_col + i * quarter_step;
 
       if (i > 0 && this_mi_col >= cm->mi_cols) break;
 
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
 
       ctx_this->rd_mode_is_ready = 0;
-      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row,
+      ctx_this->skip_ref_frame_mask = 0;
+      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+        const int used_frames = i <= 1
+                                    ? (ref_frames_used[0] | ref_frames_used[2])
+                                    : (ref_frames_used[1] | ref_frames_used[3]);
+        if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+      }
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
                            this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
                            PARTITION_VERT_4, ctx_prev, ctx_this))
         break;
@@ -3888,7 +4172,6 @@ BEGIN_PARTITION_SEARCH:
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += partition_cost[PARTITION_VERT_4];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
@@ -3924,14 +4207,6 @@ BEGIN_PARTITION_SEARCH:
     }
   }
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
-      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
-    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-              pc_tree, NULL);
-  }
-#endif  // CONFIG_DIST_8X8
-
   if (bsize == cm->seq_params.sb_size) {
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
@@ -3950,6 +4225,15 @@ static void init_first_partition_pass_stats_tables(
   }
 }
 
+// clear pc_tree_stats
+static INLINE void clear_pc_tree_stats(PC_TREE *pt) {
+  if (pt == NULL) return;
+  pt->pc_tree_stats.valid = 0;
+  for (int i = 0; i < 4; ++i) {
+    clear_pc_tree_stats(pt->split[i]);
+  }
+}
+
 // Minimum number of samples to trigger the
 // mode_pruning_based_on_two_pass_partition_search feature.
 #define FIRST_PARTITION_PASS_MIN_SAMPLES 16
@@ -3963,7 +4247,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   SPEED_FEATURES *const sf = &cpi->sf;
-  int mi_col;
   const int leaf_nodes = 256;
 
   // Initialize the left context for the new SB row
@@ -3977,26 +4260,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
     }
   }
 
+  PC_TREE *const pc_root =
+      td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
   // Code each SB in the row
-  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+  for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += cm->seq_params.mib_size) {
-    const struct segmentation *const seg = &cm->seg;
-    int dummy_rate;
-    int64_t dummy_dist;
-    RD_STATS dummy_rdc;
-    int i;
-    int seg_skip = 0;
-
-    const int idx_str = cm->mi_stride * mi_row + mi_col;
-    MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
-    PC_TREE *const pc_root =
-        td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
-
     av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
     av1_fill_mode_rates(cm, x, xd->tile_ctx);
 
     if (sf->adaptive_pred_interp_filter) {
-      for (i = 0; i < leaf_nodes; ++i) {
+      for (int i = 0; i < leaf_nodes; ++i) {
         td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
         td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
         td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -4015,10 +4288,12 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
     av1_zero(x->pred_mv);
     pc_root->index = 0;
 
+    const struct segmentation *const seg = &cm->seg;
+    int seg_skip = 0;
     if (seg->enabled) {
       const uint8_t *const map =
           seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      int segment_id =
+      const int segment_id =
           map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col)
               : 0;
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
@@ -4039,15 +4314,14 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
             cpi, block_wavelet_energy_level);
       } else {
         const int block_var_level =
-            av1_block_energy(cpi, x, cm->seq_params.sb_size);
+            av1_log_block_var(cpi, x, cm->seq_params.sb_size);
         x->sb_energy_level = block_var_level;
         offset_qindex =
             av1_compute_deltaq_from_energy_level(cpi, block_var_level);
       }
-      int qmask = ~(cm->delta_q_res - 1);
+      const int qmask = ~(cm->delta_q_res - 1);
       int current_qindex = clamp(cm->base_qindex + offset_qindex,
                                  cm->delta_q_res, 256 - cm->delta_q_res);
-
       current_qindex =
           ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
           cm->base_qindex;
@@ -4058,18 +4332,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       xd->mi[0]->current_qindex = current_qindex;
       av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
       if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
-        int j, k;
-        int lfmask = ~(cm->delta_lf_res - 1);
-        int delta_lf_from_base = offset_qindex / 2;
-        delta_lf_from_base =
-            ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
+        const int lfmask = ~(cm->delta_lf_res - 1);
+        const int delta_lf_from_base =
+            ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask);
 
         // pre-set the delta lf for loop filter. Note that this value is set
         // before mi is assigned for each block in current superblock
-        for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row);
-             j++) {
-          for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col);
-               k++) {
+        for (int j = 0;
+             j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) {
+          for (int k = 0;
+               k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) {
             cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
                 .delta_lf_from_base =
                 clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
@@ -4085,19 +4357,24 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       }
     }
 
+    int dummy_rate;
+    int64_t dummy_dist;
+    RD_STATS dummy_rdc;
+    const int idx_str = cm->mi_stride * mi_row + mi_col;
+    MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
     x->source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
-      BLOCK_SIZE bsize;
       set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
-      bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
+      const BLOCK_SIZE bsize =
+          seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                        cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
                        pc_root);
     } else if (cpi->partition_search_skippable_frame) {
-      BLOCK_SIZE bsize;
       set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
-      bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+      const BLOCK_SIZE bsize =
+          get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                        cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
@@ -4113,9 +4390,9 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       reset_partition(pc_root, cm->seq_params.sb_size);
       x->use_cb_search_range = 0;
       init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+      // Do the first pass if we need two pass partition search
       if (cpi->sf.two_pass_partition_search &&
-          cpi->sf.use_square_partition_only_threshold <
-              cm->seq_params.sb_size &&
+          cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
           mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
           mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
           cm->frame_type != KEY_FRAME) {
@@ -4123,6 +4400,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
         // Reset the stats tables.
         if (sf->mode_pruning_based_on_two_pass_partition_search)
           av1_zero(x->first_partition_pass_stats);
+        clear_pc_tree_stats(pc_root);
         rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
                               cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
                               pc_root, NULL);
@@ -4130,7 +4408,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
 
         x->source_variance = UINT_MAX;
         if (sf->adaptive_pred_interp_filter) {
-          for (i = 0; i < leaf_nodes; ++i) {
+          for (int i = 0; i < leaf_nodes; ++i) {
             td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
             td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
             td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -4157,7 +4435,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
         x->use_cb_search_range = 1;
 
         if (sf->mode_pruning_based_on_two_pass_partition_search) {
-          for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+          for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
             FIRST_PARTITION_PASS_STATS *const stat =
                 &x->first_partition_pass_stats[i];
             if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
@@ -4174,21 +4452,17 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
             }
           }
         }
-
-        rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
-                          cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
-                          pc_root, NULL);
-      } else {
-        rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
-                          cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
-                          pc_root, NULL);
       }
+
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                        cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root,
+                        NULL);
     }
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
     // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
     if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
         cm->tile_rows == 1) {
-      av1_inter_mode_data_fit(x->rdmult);
+      av1_inter_mode_data_fit(tile_data, x->rdmult);
     }
 #endif
   }
@@ -4233,6 +4507,32 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
     return cpi->common.tx_mode;
 }
 
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  int tile_col, tile_row;
+
+  if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+  CHECK_MEM_ERROR(
+      cm, cpi->tile_data,
+      aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+  cpi->allocated_tiles = tile_cols * tile_rows;
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      int i, j;
+      for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+        for (j = 0; j < MAX_MODES; ++j) {
+          tile_data->thresh_freq_fact[i][j] = 32;
+          tile_data->mode_map[i][j] = j;
+        }
+      }
+    }
+}
+
 void av1_init_tile_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -4240,28 +4540,9 @@ void av1_init_tile_data(AV1_COMP *cpi) {
   const int tile_rows = cm->tile_rows;
   int tile_col, tile_row;
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+  TOKENLIST *tplist = cpi->tplist[0][0];
   unsigned int tile_tok = 0;
-
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
-    if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
-    CHECK_MEM_ERROR(
-        cm, cpi->tile_data,
-        aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
-    cpi->allocated_tiles = tile_cols * tile_rows;
-
-    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
-      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-        TileDataEnc *const tile_data =
-            &cpi->tile_data[tile_row * tile_cols + tile_col];
-        int i, j;
-        for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-          for (j = 0; j < MAX_MODES; ++j) {
-            tile_data->thresh_freq_fact[i][j] = 32;
-            tile_data->mode_map[i][j] = j;
-          }
-        }
-      }
-  }
+  int tplist_count = 0;
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
@@ -4274,6 +4555,9 @@ void av1_init_tile_data(AV1_COMP *cpi) {
       pre_tok = cpi->tile_tok[tile_row][tile_col];
       tile_tok = allocated_tokens(
           *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+      cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+      tplist = cpi->tplist[tile_row][tile_col];
+      tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
       tile_data->allow_update_cdf = !cm->large_scale_tile;
       tile_data->allow_update_cdf =
           tile_data->allow_update_cdf && !cm->disable_cdf_update;
@@ -4281,15 +4565,56 @@ void av1_init_tile_data(AV1_COMP *cpi) {
   }
 }
 
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+                       int tile_col, int mi_row) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const int tile_cols = cm->tile_cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+  TOKENEXTRA *tok = NULL;
+  int sb_row_in_tile;
+  int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+
+  int num_mb_rows_in_sb =
+      ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+
+  sb_row_in_tile =
+      (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+
+  get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
+                cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+  cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
+
+  encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+  cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
+  cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
+      (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop -
+                     cpi->tplist[tile_row][tile_col][sb_row_in_tile].start);
+
+  assert(
+      (unsigned int)(tok -
+                     cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <=
+      get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+                      cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes));
+
+  (void)tile_mb_cols;
+  (void)num_mb_rows_in_sb;
+}
+
 void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
                      int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
   TileDataEnc *const this_tile =
       &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
-  TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+  av1_inter_mode_data_init(this_tile);
+#endif
+
   av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
                          tile_info->mi_col_end, tile_row);
   av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
@@ -4310,25 +4635,23 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
 
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->seq_params.mib_size) {
-    encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+    av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
   }
-
-  cpi->tok_count[tile_row][tile_col] =
-      (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
-  assert(cpi->tok_count[tile_row][tile_col] <=
-         allocated_tokens(*tile_info,
-                          cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
-                          av1_num_planes(cm)));
 }
 
 static void encode_tiles(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
   int tile_col, tile_row;
 
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+    av1_alloc_tile_data(cpi);
+
   av1_init_tile_data(cpi);
 
-  for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
-    for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
       av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
       cpi->intrabc_used |= cpi->td.intrabc_used_this_tile;
     }
@@ -4616,6 +4939,13 @@ static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
   return 0;
 }
 
+static void set_default_interp_skip_flags(AV1_COMP *cpi) {
+  const int num_planes = av1_num_planes(&cpi->common);
+  cpi->default_interp_skip_flags = (num_planes == 1)
+                                       ? DEFAULT_LUMA_INTERP_SKIP_FLAG
+                                       : DEFAULT_INTERP_SKIP_FLAG;
+}
+
 static void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
   MACROBLOCK *const x = &td->mb;
@@ -4683,41 +5013,41 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
     av1_hash_table_create(&cm->cur_frame->hash_table);
     av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
-                                      is_block_same[0]);
+                                      is_block_same[0], &cpi->td.mb);
     av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
                                   block_hash_values[1], is_block_same[0],
-                                  is_block_same[1]);
+                                  is_block_same[1], &cpi->td.mb);
     av1_add_to_hash_map_by_row_with_precal_data(
         &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
         pic_width, pic_height, 4);
     av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
                                   block_hash_values[0], is_block_same[1],
-                                  is_block_same[0]);
+                                  is_block_same[0], &cpi->td.mb);
     av1_add_to_hash_map_by_row_with_precal_data(
         &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
         pic_width, pic_height, 8);
     av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
                                   block_hash_values[1], is_block_same[0],
-                                  is_block_same[1]);
+                                  is_block_same[1], &cpi->td.mb);
     av1_add_to_hash_map_by_row_with_precal_data(
         &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
         pic_width, pic_height, 16);
     av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
                                   block_hash_values[0], is_block_same[1],
-                                  is_block_same[0]);
+                                  is_block_same[0], &cpi->td.mb);
     av1_add_to_hash_map_by_row_with_precal_data(
         &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
         pic_width, pic_height, 32);
     av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
                                   block_hash_values[1], is_block_same[0],
-                                  is_block_same[1]);
+                                  is_block_same[1], &cpi->td.mb);
     av1_add_to_hash_map_by_row_with_precal_data(
         &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
         pic_width, pic_height, 64);
 
     av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
                                   block_hash_values[0], is_block_same[1],
-                                  is_block_same[0]);
+                                  is_block_same[0], &cpi->td.mb);
     av1_add_to_hash_map_by_row_with_precal_data(
         &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
         pic_width, pic_height, 128);
@@ -4769,7 +5099,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   av1_initialize_rd_consts(cpi);
   av1_initialize_me_consts(cpi, x, cm->base_qindex);
   init_encode_frame_mb_context(cpi);
-
+  set_default_interp_skip_flags(cpi);
   if (cm->prev_frame)
     cm->last_frame_seg_map = cm->prev_frame->seg_map;
   else
@@ -4793,6 +5123,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
   av1_zero(rdc->global_motion_used);
   av1_zero(cpi->gmparams_cost);
+#if !CONFIG_GLOBAL_MOTION_SEARCH
+  cpi->global_motion_search_done = 1;
+#endif  // !CONFIG_GLOBAL_MOTION_SEARCH
   if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
       !cpi->global_motion_search_done) {
     YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
@@ -4939,27 +5272,13 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     }
 #endif
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-    av1_inter_mode_data_init();
-#endif
-
-    // If allowed, encoding tiles in parallel with one thread handling one tile.
-    // TODO(geza.lore): The multi-threaded encoder is not safe with more than
-    // 1 tile rows, as it uses the single above_context et al arrays from
-    // cpi->common
-    if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
+    if (cpi->row_mt && (cpi->oxcf.max_threads > 1))
+      av1_encode_tiles_mt(cpi);
+    else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
       av1_encode_tiles_mt(cpi);
     else
       encode_tiles(cpi);
 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-#if INTER_MODE_RD_TEST
-    if (cpi->sf.inter_mode_rd_model_estimation) {
-      av1_inter_mode_data_show(cm);
-    }
-#endif
-#endif
-
     aom_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
   }
@@ -5407,7 +5726,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
       tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
     }
     mbmi->tx_size = tx_size;
-    set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h,
+    set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h,
                   (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
   }
   CFL_CTX *const cfl = &xd->cfl;
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
index 62141dba4..e8cf9b468 100644
--- a/third_party/aom/av1/encoder/encodeframe.h
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_ENCODEFRAME_H_
-#define AV1_ENCODER_ENCODEFRAME_H_
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
 
 #include "aom/aom_integer.h"
 #include "av1/common/blockd.h"
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define DELTAQ_MODULATION 0  // 0: variance based, 1: wavelet AC energy based
+#define DELTAQ_MODULATION 1  // 0: variance based, 1: wavelet AC energy based
 
 struct macroblock;
 struct yv12_buffer_config;
@@ -33,12 +33,15 @@ void av1_setup_src_planes(struct macroblock *x,
 
 void av1_encode_frame(struct AV1_COMP *cpi);
 
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
 void av1_init_tile_data(struct AV1_COMP *cpi);
 void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
                      int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                       int tile_row, int tile_col, int mi_row);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_ENCODEFRAME_H_
+#endif  // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
index cea8db6f9..ad12577e6 100644
--- a/third_party/aom/av1/encoder/encodemb.c
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -222,11 +222,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
 
   a = &args->ta[blk_col];
   l = &args->tl[blk_row];
-  // Assert not magic number (uninitialized).
-  assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
 
-  if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) &&
-      !mbmi->skip_mode) {
+  if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
     TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
                                       tx_size, cm->reduced_tx_set_used);
     if (args->enable_optimize_b) {
@@ -350,6 +347,66 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
   }
 }
 
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // transform size varies per plane, look it up in a common way.
+  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+  const uint8_t txh_unit = tx_size_high_unit[tx_size];
+  const int step = txw_unit * txh_unit;
+  int i = 0, r, c;
+
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+  int blk_row, blk_col;
+
+  const BLOCK_SIZE max_unit_bsize =
+      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+  mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+  mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
+    const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+      const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+      for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+        for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+          visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+          i += step;
+        }
+      }
+    }
+  }
+}
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                   foreach_transformed_block_visitor visit,
+                                   void *arg, const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (!is_chroma_reference(mi_row, mi_col, bsize,
+                             xd->plane[plane].subsampling_x,
+                             xd->plane[plane].subsampling_y))
+      continue;
+    av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
+  }
+}
+
 typedef struct encode_block_pass1_args {
   AV1_COMMON *cm;
   MACROBLOCK *x;
@@ -382,7 +439,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
     txfm_param.tx_set_type = av1_get_ext_tx_set_type(
         txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
     if (txfm_param.is_hbd) {
-      av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
+      av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
       return;
     }
     av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
@@ -513,9 +570,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
 
   const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  // Assert not magic number (uninitialized).
-  assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
-  if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) {
+  if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
     *eob = 0;
     p->txb_entropy_ctx[block] = 0;
   } else {
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
index 673f87ea7..39080de59 100644
--- a/third_party/aom/av1/encoder/encodemb.h
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_ENCODEMB_H_
-#define AV1_ENCODER_ENCODEMB_H_
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
 
 #include "config/aom_config.h"
 
@@ -47,7 +47,18 @@ typedef enum AV1_XFORM_QUANT {
 
 void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                    int mi_row, int mi_col, RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                   foreach_transformed_block_visitor visit,
+                                   void *arg, const int num_planes);
+
 void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+
 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                      int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
                      TX_SIZE tx_size, TX_TYPE tx_type,
@@ -82,4 +93,4 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_ENCODEMB_H_
+#endif  // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
index 944e2c53d..42eb5abf6 100644
--- a/third_party/aom/av1/encoder/encodemv.c
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -18,19 +18,37 @@
 #include "av1/encoder/encodemv.h"
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+  return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t log_in_base_2(unsigned int n) {
+  // get_msb() is only valid when n != 0.
+  return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) {
+  const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
+                              ? MV_CLASS_10
+                              : (MV_CLASS_TYPE)log_in_base_2(z >> 3);
+  if (offset) *offset = z - mv_class_base(c);
+  return c;
+}
 
 static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
                                 MvSubpelPrecision precision) {
+  assert(comp != 0);
   int offset;
   const int sign = comp < 0;
   const int mag = sign ? -comp : comp;
-  const int mv_class = av1_get_mv_class(mag - 1, &offset);
+  const int mv_class = get_mv_class(mag - 1, &offset);
   const int d = offset >> 3;         // int mv data
   const int fr = (offset >> 1) & 3;  // fractional mv data
   const int hp = offset & 1;         // high precision mv data
 
-  assert(comp != 0);
-
   // Sign
   aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
 
@@ -89,7 +107,7 @@ static void build_nmv_component_cost_table(int *mvcost,
   for (v = 1; v <= MV_MAX; ++v) {
     int z, c, o, d, e, f, cost = 0;
     z = v - 1;
-    c = av1_get_mv_class(z, &o);
+    c = get_mv_class(z, &o);
     cost += class_cost[c];
     d = (o >> 3);     /* int mv data */
     f = (o >> 1) & 3; /* fractional pel mv data */
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
index 64e9e7162..37ff547c8 100644
--- a/third_party/aom/av1/encoder/encodemv.h
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_ENCODEMV_H_
-#define AV1_ENCODER_ENCODEMV_H_
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
 
 #include "av1/encoder/encoder.h"
 
@@ -40,8 +40,16 @@ void av1_find_best_ref_mvs_from_stack(int allow_hp,
                                       int_mv *nearest_mv, int_mv *near_mv,
                                       int is_integer);
 
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+  if (mv->row == 0) {
+    return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
+  } else {
+    return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_ENCODEMV_H_
+#endif  // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 13ea32e38..a2da2df89 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -14,9 +14,28 @@
 #include <stdio.h>
 
 #include "config/aom_config.h"
-#include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 
 #include "av1/common/alloccommon.h"
 #include "av1/common/cdef.h"
@@ -38,6 +57,7 @@
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/encoder/grain_test_vectors.h"
 #include "av1/encoder/hash_motion.h"
 #include "av1/encoder/mbgraph.h"
 #include "av1/encoder/picklpf.h"
@@ -49,26 +69,6 @@
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/temporal_filter.h"
 
-#include "aom_dsp/psnr.h"
-#if CONFIG_INTERNAL_STATS
-#include "aom_dsp/ssim.h"
-#endif
-#include "av1/encoder/grain_test_vectors.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#if CONFIG_DENOISE
-#include "aom_dsp/grain_table.h"
-#include "aom_dsp/noise_util.h"
-#include "aom_dsp/noise_model.h"
-#endif
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-#include "aom_scale/aom_scale.h"
-#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
-#include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
-
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
 // av1 uses 10,000,000 ticks/second as time stamp
@@ -413,18 +413,13 @@ static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
 }
 
 void av1_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    av1_rtcd();
-    aom_dsp_rtcd();
-    aom_scale_rtcd();
-    av1_init_intra_predictors();
-    av1_init_me_luts();
-    av1_rc_init_minq_luts();
-    av1_init_wedge_masks();
-    init_done = 1;
-  }
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+  av1_init_intra_predictors();
+  av1_init_me_luts();
+  av1_rc_init_minq_luts();
+  av1_init_wedge_masks();
 }
 
 static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
@@ -506,6 +501,11 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
   aom_free(cpi->td.mb.wsrc_buf);
   cpi->td.mb.wsrc_buf = NULL;
 
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.hash_value_buffer[i][j]);
+      cpi->td.mb.hash_value_buffer[i][j] = NULL;
+    }
   aom_free(cpi->td.mb.mask_buf);
   cpi->td.mb.mask_buf = NULL;
 
@@ -527,10 +527,18 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
   aom_free(cpi->tile_tok[0][0]);
   cpi->tile_tok[0][0] = 0;
 
+  aom_free(cpi->tplist[0][0]);
+  cpi->tplist[0][0] = NULL;
+
   av1_free_pc_tree(&cpi->td, num_planes);
 
   aom_free(cpi->td.mb.palette_buffer);
 
+  aom_free(cpi->td.mb.tmp_conv_dst);
+  for (int j = 0; j < 2; ++j) {
+    aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
+  }
+
 #if CONFIG_DENOISE
   if (cpi->denoise_and_model) {
     aom_denoise_and_model_free(cpi->denoise_and_model);
@@ -785,6 +793,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
 
   av1_alloc_context_buffers(cm, cm->width, cm->height);
 
+  int mi_rows_aligned_to_sb =
+      ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
   av1_alloc_txb_buf(cpi);
 
   alloc_context_buffers_ext(cpi);
@@ -797,6 +809,11 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
     CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
                     aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
+  aom_free(cpi->tplist[0][0]);
+
+  CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
+                  aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+                             sizeof(*cpi->tplist[0][0])));
 
   av1_setup_pc_tree(&cpi->common, &cpi->td);
 }
@@ -1067,6 +1084,32 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
         10;  // Default value (not signaled)
   }
 
+  if (cm->seq_params.monochrome) {
+    cm->seq_params.subsampling_x = 1;
+    cm->seq_params.subsampling_y = 1;
+  } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 &&
+             cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB &&
+             cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+    cm->seq_params.subsampling_x = 0;
+    cm->seq_params.subsampling_y = 0;
+  } else {
+    if (cm->seq_params.profile == 0) {
+      cm->seq_params.subsampling_x = 1;
+      cm->seq_params.subsampling_y = 1;
+    } else if (cm->seq_params.profile == 1) {
+      cm->seq_params.subsampling_x = 0;
+      cm->seq_params.subsampling_y = 0;
+    } else {
+      if (cm->seq_params.bit_depth == AOM_BITS_12) {
+        cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x;
+        cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y;
+      } else {
+        cm->seq_params.subsampling_x = 1;
+        cm->seq_params.subsampling_y = 0;
+      }
+    }
+  }
+
   cm->width = oxcf->width;
   cm->height = oxcf->height;
   set_sb_size(&cm->seq_params,
@@ -2326,6 +2369,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
 
   cpi->oxcf = *oxcf;
   cpi->common.options = oxcf->cfg;
+  cpi->row_mt = oxcf->row_mt;
   x->e_mbd.bd = (int)seq_params->bit_depth;
   x->e_mbd.global_motion = cm->global_motion;
 
@@ -2350,6 +2394,22 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
     CHECK_MEM_ERROR(cm, x->palette_buffer,
                     aom_memalign(16, sizeof(*x->palette_buffer)));
   }
+
+  if (x->tmp_conv_dst == NULL) {
+    CHECK_MEM_ERROR(
+        cm, x->tmp_conv_dst,
+        aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+    x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (x->tmp_obmc_bufs[i] == NULL) {
+      CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+                      aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                           sizeof(*x->tmp_obmc_bufs[i])));
+      x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+    }
+  }
+
   av1_reset_segment_features(cm);
   set_high_precision_mv(cpi, 1, 0);
 
@@ -2367,11 +2427,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   rc->worst_quality = cpi->oxcf.worst_allowed_q;
   rc->best_quality = cpi->oxcf.best_allowed_q;
 
-  if (!oxcf->large_scale_tile)
-    cm->interp_filter = cpi->sf.default_interp_filter;
-  else
-    cm->interp_filter = EIGHTTAP_REGULAR;
-
+  cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
   cm->switchable_motion_mode = 1;
 
   if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
@@ -2588,6 +2644,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
                   (int32_t *)aom_memalign(
                       16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
 
+  for (int x = 0; x < 2; x++)
+    for (int y = 0; y < 2; y++)
+      CHECK_MEM_ERROR(
+          cm, cpi->td.mb.hash_value_buffer[x][y],
+          (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+                                 sizeof(*cpi->td.mb.hash_value_buffer[0][0])));
+
+  cpi->td.mb.g_crc_initialized = 0;
+
   CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
                   (int32_t *)aom_memalign(
                       16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
@@ -2913,9 +2978,19 @@ void av1_remove_compressor(AV1_COMP *cpi) {
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
       aom_free(thread_data->td->palette_buffer);
+      aom_free(thread_data->td->tmp_conv_dst);
+      for (int j = 0; j < 2; ++j) {
+        aom_free(thread_data->td->tmp_obmc_bufs[j]);
+      }
       aom_free(thread_data->td->above_pred_buf);
       aom_free(thread_data->td->left_pred_buf);
       aom_free(thread_data->td->wsrc_buf);
+      for (int x = 0; x < 2; x++) {
+        for (int y = 0; y < 2; y++) {
+          aom_free(thread_data->td->hash_value_buffer[x][y]);
+          thread_data->td->hash_value_buffer[x][y] = NULL;
+        }
+      }
       aom_free(thread_data->td->mask_buf);
       aom_free(thread_data->td->counts);
       av1_free_pc_tree(thread_data->td, num_planes);
@@ -3058,53 +3133,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
 }
 #endif
 
-#if USE_GF16_MULTI_LAYER
-static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-  AV1_COMMON *const cm = &cpi->common;
-  const FRAME_UPDATE_TYPE next_frame_update_type =
-      gf_group->update_type[gf_group->index];
-
-  if (cm->show_existing_frame == 1) {
-    cm->show_existing_frame = 0;
-  } else if (cpi->rc.is_last_bipred_frame) {
-    cpi->rc.is_last_bipred_frame = 0;
-    cm->show_existing_frame = 1;
-    cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1];
-  } else if (next_frame_update_type == OVERLAY_UPDATE ||
-             next_frame_update_type == INTNL_OVERLAY_UPDATE) {
-    // Check the temporal filtering status for the next OVERLAY frame
-    const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
-    int which_arf = 0, arf_idx;
-    // Identify the index to the next overlay frame.
-    for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
-      if (gf_group->index == cpi->arf_pos_for_ovrly[arf_idx]) {
-        which_arf = arf_idx;
-        break;
-      }
-    }
-    assert(arf_idx < num_arfs_in_gf);
-    if (cpi->is_arf_filter_off[which_arf]) {
-      cm->show_existing_frame = 1;
-      cpi->rc.is_src_frame_alt_ref = 1;
-      cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
-                                         ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
-                                         : cpi->ref_fb_idx[BWDREF_FRAME - 1];
-      cpi->is_arf_filter_off[which_arf] = 0;
-    }
-  }
-  cpi->rc.is_src_frame_ext_arf = 0;
-}
-#endif  // USE_GF16_MULTI_LAYER
-
 static void check_show_existing_frame(AV1_COMP *cpi) {
-#if USE_GF16_MULTI_LAYER
-  if (cpi->rc.baseline_gf_interval == 16) {
-    check_show_existing_frame_gf16(cpi);
-    return;
-  }
-#endif  // USE_GF16_MULTI_LAYER
-
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
   AV1_COMMON *const cm = &cpi->common;
   const FRAME_UPDATE_TYPE next_frame_update_type =
@@ -3350,13 +3379,13 @@ static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
                                       EXTREF_FRAME - 1 };
 
   for (int i = 2; i > 0; --i) {
-    cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
-
     // [0] is allocated to the current coded frame, i.e. bwdref
     memcpy(
         cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
         cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME],
         sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME]));
+
+    cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
   }
 }
 
@@ -3370,52 +3399,16 @@ static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
                                       EXTREF_FRAME - 1 };
 
   for (int i = 0; i < 2; ++i) {
-    cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
-
     // [0] is allocated to the current coded frame, i.e. bwdref
     memcpy(
         cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
         cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME],
         sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME]));
-  }
-}
-#endif  // USE_SYMM_MULTI_LAYER
 
-#if USE_GF16_MULTI_LAYER
-static void update_reference_frames_gf16(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  BufferPool *const pool = cm->buffer_pool;
-
-  if (cm->frame_type == KEY_FRAME) {
-    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
-      ref_cnt_fb(pool->frame_bufs,
-                 &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
-                 cm->new_fb_idx);
-    }
-  } else {
-    if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
-        cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
-        cpi->refresh_alt_ref_frame) {
-      assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
-      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->refresh_fb_idx],
-                 cm->new_fb_idx);
-    }
-
-    // TODO(zoeliu): To handle cpi->interp_filter_selected[].
-
-    // For GF of 16, an additional ref frame index mapping needs to be handled
-    // if this is the last frame to encode in the current GF group.
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    if (gf_group->update_type[gf_group->index + 1] == OVERLAY_UPDATE)
-      av1_ref_frame_map_idx_updates(cpi, gf_group->index + 1);
+    cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
   }
-
-#if DUMP_REF_FRAME_IMAGES == 1
-  // Dump out all reference frame images.
-  dump_ref_frame_images(cpi);
-#endif  // DUMP_REF_FRAME_IMAGES
 }
-#endif  // USE_GF16_MULTI_LAYER
+#endif  // USE_SYMM_MULTI_LAYER
 
 static void update_reference_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -3424,12 +3417,20 @@ static void update_reference_frames(AV1_COMP *cpi) {
   //       for the purpose to verify no mismatch between encoder and decoder.
   if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
 
-#if USE_GF16_MULTI_LAYER
-  if (cpi->rc.baseline_gf_interval == 16) {
-    update_reference_frames_gf16(cpi);
-    return;
+  // In the case of show_existing_frame, we will not send refresh flags
+  // to the decoder. Any change in the reference frame buffer can be done by
+  // switching the virtual indices.
+  if (cm->show_existing_frame) {
+    cpi->refresh_last_frame = 0;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_bwd_ref_frame = 0;
+    cpi->refresh_alt2_ref_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
+
+    cpi->rc.is_bwd_ref_frame = 0;
+    cpi->rc.is_last_bipred_frame = 0;
+    cpi->rc.is_bipred_frame = 0;
   }
-#endif  // USE_GF16_MULTI_LAYER
 
   BufferPool *const pool = cm->buffer_pool;
 
@@ -3458,9 +3459,15 @@ static void update_reference_frames(AV1_COMP *cpi) {
     // slot and, if we're updating the GF, the current frame becomes the new GF.
     int tmp;
 
-    ref_cnt_fb(pool->frame_bufs,
-               &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
-               cm->new_fb_idx);
+    // ARF in general is a better reference than overlay. We should keep ARF as
+    // reference instead of replacing it with overlay.
+
+    if (!cpi->preserve_arf_as_gld) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
+                 cm->new_fb_idx);
+    }
+
     tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1];
     cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
     cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
@@ -3758,7 +3765,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
   av1_set_speed_features_framesize_independent(cpi);
   av1_set_rd_speed_thresholds(cpi);
   av1_set_rd_speed_thresholds_sub8x8(cpi);
-  cpi->common.interp_filter = cpi->sf.default_interp_filter;
+  cpi->common.interp_filter = SWITCHABLE;
   cpi->common.switchable_motion_mode = 1;
 }
 
@@ -3818,7 +3825,8 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy,
   rst[2].restoration_unit_size = rst[1].restoration_unit_size;
 }
 
-static void init_ref_frame_bufs(AV1_COMMON *cm) {
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
   int i;
   BufferPool *const pool = cm->buffer_pool;
   cm->new_fb_idx = INVALID_IDX;
@@ -3828,7 +3836,7 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) {
   }
   if (cm->seq_params.force_screen_content_tools) {
     for (i = 0; i < FRAME_BUFFERS; ++i) {
-      av1_hash_table_init(&pool->frame_bufs[i].hash_table);
+      av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb);
     }
   }
 }
@@ -3846,7 +3854,7 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
     seq_params->use_highbitdepth = use_highbitdepth;
 
     alloc_raw_frame_buffers(cpi);
-    init_ref_frame_bufs(cm);
+    init_ref_frame_bufs(cpi);
     alloc_util_frame_buffers(cpi);
 
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
@@ -4220,7 +4228,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
 
   if (lf->filter_level[0] || lf->filter_level[1]) {
 #if LOOP_FILTER_BITMASK
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0);
 #else
     if (cpi->num_workers > 1)
       av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
@@ -4587,8 +4595,8 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
         rc->projected_frame_size < rc->max_frame_bandwidth)
       loop = 0;
 
-    if (recode_loop_test_global_motion(cpi)) {
-      loop = 1;
+    if (!cpi->sf.gm_disable_recode) {
+      if (recode_loop_test_global_motion(cpi)) loop = 1;
     }
 
     if (loop) {
@@ -4716,47 +4724,6 @@ static void set_ext_overrides(AV1_COMP *cpi) {
       cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME;
 }
 
-static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
-  InterpFilter ifilter;
-  int ref_total[REF_FRAMES] = { 0 };
-  MV_REFERENCE_FRAME ref;
-  int mask = 0;
-  int arf_idx = ALTREF_FRAME;
-
-  if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
-      cpi->refresh_alt2_ref_frame)
-    return mask;
-
-  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
-    for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
-      ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
-
-  for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
-    if ((ref_total[LAST_FRAME] &&
-         cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
-        (ref_total[LAST2_FRAME] == 0 ||
-         cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 <
-             ref_total[LAST2_FRAME]) &&
-        (ref_total[LAST3_FRAME] == 0 ||
-         cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 <
-             ref_total[LAST3_FRAME]) &&
-        (ref_total[GOLDEN_FRAME] == 0 ||
-         cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 <
-             ref_total[GOLDEN_FRAME]) &&
-        (ref_total[BWDREF_FRAME] == 0 ||
-         cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 <
-             ref_total[BWDREF_FRAME]) &&
-        (ref_total[ALTREF2_FRAME] == 0 ||
-         cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 <
-             ref_total[ALTREF2_FRAME]) &&
-        (ref_total[ALTREF_FRAME] == 0 ||
-         cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
-             ref_total[ALTREF_FRAME]))
-      mask |= 1 << ifilter;
-  }
-  return mask;
-}
-
 #define DUMP_RECON_FRAMES 0
 
 #if DUMP_RECON_FRAMES == 1
@@ -4914,7 +4881,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   if (cm->current_video_frame > 0)
     cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
-  if (cm->show_existing_frame) {
+  if (encode_show_existing_frame(cm)) {
     // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
     //               BWDREF_FRAME in the reference frame buffer.
     if (cm->frame_type == KEY_FRAME) {
@@ -4925,20 +4892,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
     cm->show_frame = 1;
     cpi->frame_flags = *frame_flags;
 
-    // In the case of show_existing frame, we will not send fresh flag
-    // to decoder. Any change in the reference frame buffer can be done by
-    // switching the virtual indices.
-
-    cpi->refresh_last_frame = 0;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_bwd_ref_frame = 0;
-    cpi->refresh_alt2_ref_frame = 0;
-    cpi->refresh_alt_ref_frame = 0;
-
-    cpi->rc.is_bwd_ref_frame = 0;
-    cpi->rc.is_last_bipred_frame = 0;
-    cpi->rc.is_bipred_frame = 0;
-
     restore_coding_context(cpi);
 
     // Build the bitstream
@@ -4990,10 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
       av1_rc_postencode_update(cpi, *size);
     }
 
-    // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
-
     ++cm->current_video_frame;
 
     return AOM_CODEC_OK;
@@ -5002,9 +4951,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
-  if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
-    cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
-
   // Set various flags etc to special state if it is a key frame.
   if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
     // Reset the loop filter deltas and segmentation map.
@@ -5246,15 +5192,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   // takes a space in the gf group. Therefore, even when
   // it is not shown, we still need update the count down.
 
-  // TODO(weitinglin): This is a work-around to handle the condition
-  // when a frame is drop. We should fix the cm->show_frame flag
-  // instead of checking the other condition to update the counter properly.
-  if (cm->show_frame || is_frame_droppable(cpi)) {
-    // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
-  }
-
   if (cm->show_frame) {
     // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that
     // are
@@ -5279,6 +5216,50 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
   return AOM_CODEC_OK;
 }
 
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+  //               differently here for rc->avg_frame_bandwidth.
+  if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
+    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+        cpi->common.frame_type == KEY_FRAME) {
+      // This update is skipped only for a show_existing_frame that has a
+      // source other than altref and is not a displayed forward keyframe:
+      // its counters were incremented when it was originally encoded.
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+    }
+  }
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame. This is skipped
+  // for a show_existing_frame that has a source other than altref and is
+  // not a displayed forward keyframe, since the index was incremented when
+  // it was originally encoded.
+  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+      cpi->common.frame_type == KEY_FRAME) {
+    ++cpi->twopass.gf_group.index;
+  }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+  if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
 static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
                        int skip_adapt, unsigned int *frame_flags) {
   if (cpi->oxcf.rc_mode == AOM_CBR) {
@@ -5290,6 +5271,7 @@ static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
       AOM_CODEC_OK) {
     return AOM_CODEC_ERROR;
   }
+  update_rc_counts(cpi);
   check_show_existing_frame(cpi);
   return AOM_CODEC_OK;
 }
@@ -5319,14 +5301,8 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
           cm->cum_txcoeff_cost_timer);
 #endif
 
-  // Do not do post-encoding update for those frames that do not have a spot
-  // in
-  // a gf group, but note that an OVERLAY frame always has a spot in a gf
-  // group,
-  // even when show_existing_frame is used.
-  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) {
-    av1_twopass_postencode_update(cpi);
-  }
+  av1_twopass_postencode_update(cpi);
+  update_rc_counts(cpi);
   check_show_existing_frame(cpi);
   return AOM_CODEC_OK;
 }
@@ -5734,7 +5710,7 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
       av1_get_block_hash_value(
           cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
           block_size, &hash_value_1, &hash_value_2,
-          (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH));
+          (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
       // Hashing does not work for highbitdepth currently.
       // TODO(Roger): Make it work for highbitdepth.
       if (av1_use_hash_me(&cpi->common)) {
@@ -5822,13 +5798,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 
   set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
 
-  // Is multi-arf enabled.
-  // Note that at the moment multi_arf is only configured for 2 pass VBR
-  if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
-    cpi->multi_arf_allowed = 1;
-  else
-    cpi->multi_arf_allowed = 0;
-
   // Normal defaults
   cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
                                   ? REFRESH_FRAME_CONTEXT_DISABLED
@@ -5850,16 +5819,20 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   struct lookahead_entry *lookahead_src = NULL;
   if (cm->current_video_frame > 0)
     lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
-  if (lookahead_src != NULL &&
-      ((cpi->oxcf.error_resilient_mode |
-        ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) ||
-       (cpi->oxcf.s_frame_mode |
-        ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) &&
-      !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
-    cm->show_existing_frame = 0;
+
+  int use_show_existing = 1;
+  if (lookahead_src != NULL) {
+    const int is_error_resilient =
+        cpi->oxcf.error_resilient_mode ||
+        (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+    const int is_s_frame = cpi->oxcf.s_frame_mode ||
+                           (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+    const int is_key_frame =
+        (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
+    use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
   }
 
-  if (oxcf->pass == 2 && cm->show_existing_frame) {
+  if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
     // Manage the source buffer and flush out the source frame that has been
     // coded already; Also get prepared for PSNR calculation if needed.
     if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
@@ -6415,3 +6388,50 @@ int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
   const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
   return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
+
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
+  if (!cpi) return NULL;
+
+  uint8_t header_buf[512] = { 0 };
+  const uint32_t sequence_header_size =
+      write_sequence_header_obu(cpi, &header_buf[0]);
+  assert(sequence_header_size <= sizeof(header_buf));
+  if (sequence_header_size == 0) return NULL;
+
+  const size_t obu_header_size = 1;
+  const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+  const size_t payload_offset = obu_header_size + size_field_size;
+
+  if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
+  memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
+
+  if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
+      obu_header_size) {
+    return NULL;
+  }
+
+  size_t coded_size_field_size = 0;
+  if (aom_uleb_encode(sequence_header_size, size_field_size,
+                      &header_buf[obu_header_size],
+                      &coded_size_field_size) != 0) {
+    return NULL;
+  }
+  assert(coded_size_field_size == size_field_size);
+
+  aom_fixed_buf_t *global_headers =
+      (aom_fixed_buf_t *)malloc(sizeof(*global_headers));
+  if (!global_headers) return NULL;
+
+  const size_t global_header_buf_size =
+      obu_header_size + size_field_size + sequence_header_size;
+
+  global_headers->buf = malloc(global_header_buf_size);
+  if (!global_headers->buf) {
+    free(global_headers);
+    return NULL;
+  }
+
+  memcpy(global_headers->buf, &header_buf[0], global_header_buf_size);
+  global_headers->sz = global_header_buf_size;
+  return global_headers;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 2b7ab711d..ee7fc4637 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_ENCODER_H_
-#define AV1_ENCODER_ENCODER_H_
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
 
 #include <stdio.h>
 
@@ -142,7 +142,6 @@ typedef struct AV1EncoderConfig {
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;          // sharpening output: recommendation 0:
   int speed;
-  int dev_sf;
   // maximum allowed bitrate for any intra frame in % of bitrate target.
   unsigned int rc_max_intra_bitrate_pct;
   // maximum allowed bitrate for any inter frame in % of bitrate target.
@@ -249,6 +248,7 @@ typedef struct AV1EncoderConfig {
   int min_gf_interval;
   int max_gf_interval;
 
+  int row_mt;
   int tile_columns;
   int tile_rows;
   int tile_width_count;
@@ -309,6 +309,9 @@ typedef struct AV1EncoderConfig {
   float noise_level;
   int noise_block_size;
 #endif
+
+  unsigned int chroma_subsampling_x;
+  unsigned int chroma_subsampling_y;
 } AV1EncoderConfig;
 
 static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -401,6 +404,43 @@ typedef struct FRAME_COUNTS {
                                 [SWITCHABLE_FILTERS];
 } FRAME_COUNTS;
 
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+  int ready;
+  double a;
+  double b;
+  double dist_mean;
+  double ld_mean;
+  double sse_mean;
+  double sse_sse_mean;
+  double sse_ld_mean;
+  int num;
+  double dist_sum;
+  double ld_sum;
+  double sse_sum;
+  double sse_sse_sum;
+  double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+  int idx;
+  int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out
+// what the maximum number of modes is.
+#define MAX_INTER_MODES 1024
+typedef struct inter_modes_info {
+  int num;
+  MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+  int mode_rate_arr[MAX_INTER_MODES];
+  int64_t sse_arr[MAX_INTER_MODES];
+  int64_t est_rd_arr[MAX_INTER_MODES];
+  RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+} InterModesInfo;
+#endif
+
 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
 typedef struct TileDataEnc {
   TileInfo tile_info;
@@ -411,8 +451,18 @@ typedef struct TileDataEnc {
   CFL_CTX cfl;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   uint8_t allow_update_cdf;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+  InterModesInfo inter_modes_info;
+#endif
 } TileDataEnc;
 
+typedef struct {
+  TOKENEXTRA *start;
+  TOKENEXTRA *stop;
+  unsigned int count;
+} TOKENLIST;
+
 typedef struct RD_COUNTS {
   int64_t comp_pred_diff[REFERENCE_MODES];
   // Stores number of 4x4 blocks using global motion per reference frame.
@@ -427,11 +477,14 @@ typedef struct ThreadData {
   FRAME_COUNTS *counts;
   PC_TREE *pc_tree;
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+  uint32_t *hash_value_buffer[2][2];
   int32_t *wsrc_buf;
   int32_t *mask_buf;
   uint8_t *above_pred_buf;
   uint8_t *left_pred_buf;
   PALETTE_BUFFER *palette_buffer;
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
   int intrabc_used_this_tile;
 } ThreadData;
 
@@ -502,6 +555,7 @@ typedef struct AV1_COMP {
   int previous_index;
   int cur_poc;  // DebugInfo
 
+  unsigned int row_mt;
   int scaled_ref_idx[REF_FRAMES];
   int ref_fb_idx[REF_FRAMES];
   int refresh_fb_idx;  // ref frame buffer index to refresh
@@ -647,13 +701,12 @@ typedef struct AV1_COMP {
 
   search_site_config ss_cfg;
 
-  int multi_arf_allowed;
-
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
 
   TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
   unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+  TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
 
   TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
 
@@ -703,8 +756,13 @@ typedef struct AV1_COMP {
 #if CONFIG_DENOISE
   struct aom_denoise_and_model_t *denoise_and_model;
 #endif
+  // Stores the default value of the skip flag, which depends on the chroma
+  // format: set to 1 for monochrome and 3 for other color formats.
+  int default_interp_skip_flags;
+  int preserve_arf_as_gld;
 } AV1_COMP;
 
+// Must not be called more than once.
 void av1_initialize_enc(void);
 
 struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
@@ -833,6 +891,22 @@ static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
   return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
 }
 
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+                                 int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tile_cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+
+  const int tile_mb_cols =
+      (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+  const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+  *tok = cpi->tile_tok[tile_row][tile_col] +
+         get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
 void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
 
 #define ALT_MIN_LAG 3
@@ -885,8 +959,27 @@ static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
   return !av1_superres_scaled(cm) && av1_resize_scaled(cm);
 }
 
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+  return cm->show_existing_frame &&
+         (!cm->error_resilient_mode || cm->frame_type == KEY_FRAME);
+}
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
+// function, the memory must be freed by the caller. Both the buf member of the
+// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory
+// returned must be freed via call to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_ENCODER_H_
+#endif  // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 81f360733..5a31d93d7 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -133,6 +133,38 @@ static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
   return error;
 }
 
+static const int8_t eob_to_pos_small[33] = {
+  0, 1, 2,                                        // 0-2
+  3, 3,                                           // 3-4
+  4, 4, 4, 4,                                     // 5-8
+  5, 5, 5, 5, 5, 5, 5, 5,                         // 9-16
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6  // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+  6,                               // place holder
+  7,                               // 33-64
+  8,  8,                           // 65-128
+  9,  9,  9,  9,                   // 129-256
+  10, 10, 10, 10, 10, 10, 10, 10,  // 257-512
+  11                               // 513-
+};
+
+static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+  int t;
+
+  if (eob < 33) {
+    t = eob_to_pos_small[eob];
+  } else {
+    const int e = AOMMIN((eob - 1) >> 5, 16);
+    t = eob_to_pos_large[e];
+  }
+
+  *extra = eob - k_eob_group_start[t];
+
+  return t;
+}
+
 #if CONFIG_ENTROPY_STATS
 void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
                             TX_CLASS tx_class, PLANE_TYPE plane,
@@ -464,8 +496,12 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
                           aom_writer *w, int blk_row, int blk_col, int plane,
                           TX_SIZE tx_size, const tran_low_t *tcoeff,
                           uint16_t eob, TXB_CTX *txb_ctx) {
-  const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  aom_write_symbol(w, eob == 0,
+                   ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+  if (eob == 0) return;
+  const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
                                           tx_size, cm->reduced_tx_set_used);
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
@@ -475,18 +511,10 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
   const int height = get_txb_high(tx_size);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
   DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-
-  aom_write_symbol(w, eob == 0,
-                   ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
-  if (plane == 0 && eob == 0) {
-    assert(tx_type == DCT_DCT);
-  }
-  if (eob == 0) return;
-
   av1_txb_init_levels(tcoeff, width, height, levels);
 
   av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index 0442cc613..40ae343b0 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef ENCODETXB_H_
-#define ENCODETXB_H_
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
 
 #include "config/aom_config.h"
 
@@ -84,4 +84,4 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
 }
 #endif
 
-#endif  // COEFFS_CODING_H_
+#endif  // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 637d6824c..e8ac30bb5 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -27,7 +27,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
 }
 
-static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+static int enc_worker_hook(void *arg1, void *unused) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
   const AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tile_cols;
@@ -47,88 +48,141 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
   return 1;
 }
 
-void av1_encode_tiles_mt(AV1_COMP *cpi) {
+static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
   AV1_COMMON *const cm = &cpi->common;
-  const int tile_cols = cm->tile_cols;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
-  int i;
 
-  av1_init_tile_data(cpi);
+  CHECK_MEM_ERROR(cm, cpi->workers,
+                  aom_malloc(num_workers * sizeof(*cpi->workers)));
 
-  // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
-    CHECK_MEM_ERROR(cm, cpi->workers,
-                    aom_malloc(num_workers * sizeof(*cpi->workers)));
+  CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                  aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
 
-    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                    aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+  for (int i = 0; i < num_workers; i++) {
+    AVxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
-    for (i = 0; i < num_workers; i++) {
-      AVxWorker *const worker = &cpi->workers[i];
-      EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+    ++cpi->num_workers;
+    winterface->init(worker);
+
+    thread_data->cpi = cpi;
+
+    if (i < num_workers - 1) {
+      // Allocate thread data.
+      CHECK_MEM_ERROR(cm, thread_data->td,
+                      aom_memalign(32, sizeof(*thread_data->td)));
+      av1_zero(*thread_data->td);
+
+      // Set up pc_tree.
+      thread_data->td->pc_tree = NULL;
+      av1_setup_pc_tree(cm, thread_data->td);
+
+      CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+                      (uint8_t *)aom_memalign(
+                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
+                                  sizeof(*thread_data->td->above_pred_buf)));
+      CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+                      (uint8_t *)aom_memalign(
+                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
+                                  sizeof(*thread_data->td->left_pred_buf)));
+
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->wsrc_buf,
+          (int32_t *)aom_memalign(
+              16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+
+      for (int x = 0; x < 2; x++)
+        for (int y = 0; y < 2; y++)
+          CHECK_MEM_ERROR(
+              cm, thread_data->td->hash_value_buffer[x][y],
+              (uint32_t *)aom_malloc(
+                  AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+                  sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->mask_buf,
+          (int32_t *)aom_memalign(
+              16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+      // Allocate frame counters in thread data.
+      CHECK_MEM_ERROR(cm, thread_data->td->counts,
+                      aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+      // Allocate buffers used by palette coding mode.
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->palette_buffer,
+          aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->tmp_conv_dst,
+          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                               sizeof(*thread_data->td->tmp_conv_dst)));
+      for (int j = 0; j < 2; ++j) {
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->tmp_obmc_bufs[j],
+            aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                 sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+      }
 
-      ++cpi->num_workers;
-      winterface->init(worker);
+      // Create threads
+      if (!winterface->reset(worker))
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
+    } else {
+      // Main thread acts as a worker and uses the thread data in cpi.
+      thread_data->td = &cpi->td;
+    }
+    winterface->sync(worker);
+  }
+}
 
-      thread_data->cpi = cpi;
+static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  // Encode a frame
+  for (int i = 0; i < num_workers; i++) {
+    AVxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
 
-      if (i < num_workers - 1) {
-        // Allocate thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td,
-                        aom_memalign(32, sizeof(*thread_data->td)));
-        av1_zero(*thread_data->td);
+    // Set the starting tile for each thread.
+    thread_data->start = i;
 
-        // Set up pc_tree.
-        thread_data->td->pc_tree = NULL;
-        av1_setup_pc_tree(cm, thread_data->td);
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+}
 
-        CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
-                        (uint8_t *)aom_memalign(
-                            16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                    sizeof(*thread_data->td->above_pred_buf)));
-        CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
-                        (uint8_t *)aom_memalign(
-                            16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                    sizeof(*thread_data->td->left_pred_buf)));
+static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->wsrc_buf,
-            (int32_t *)aom_memalign(
-                16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->mask_buf,
-            (int32_t *)aom_memalign(
-                16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
-        // Allocate frame counters in thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td->counts,
-                        aom_calloc(1, sizeof(*thread_data->td->counts)));
-
-        // Allocate buffers used by palette coding mode.
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->palette_buffer,
-            aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
-
-        // Create threads
-        if (!winterface->reset(worker))
-          aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                             "Tile encoder thread creation failed");
-      } else {
-        // Main thread acts as a worker and uses the thread data in cpi.
-        thread_data->td = &cpi->td;
-      }
+  // Encoding ends.
+  for (int i = 0; i < num_workers; i++) {
+    AVxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+}
 
-      winterface->sync(worker);
+static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
+  for (int i = 0; i < num_workers; i++) {
+    AVxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
+    // Accumulate counters.
+    if (i < cpi->num_workers - 1) {
+      av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+      cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
     }
-  } else {
-    num_workers = AOMMIN(num_workers, cpi->num_workers);
   }
+}
 
-  for (i = 0; i < num_workers; i++) {
+static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                int num_workers) {
+  for (int i = 0; i < num_workers; i++) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
-    worker->hook = (AVxWorkerHook)enc_worker_hook;
+    worker->hook = hook;
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
@@ -139,47 +193,59 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
       thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
       thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
       thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+      for (int x = 0; x < 2; x++) {
+        for (int y = 0; y < 2; y++) {
+          memcpy(thread_data->td->hash_value_buffer[x][y],
+                 cpi->td.mb.hash_value_buffer[x][y],
+                 AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+                     sizeof(*thread_data->td->hash_value_buffer[0][0]));
+          thread_data->td->mb.hash_value_buffer[x][y] =
+              thread_data->td->hash_value_buffer[x][y];
+        }
+      }
       thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
     }
     if (thread_data->td->counts != &cpi->counts) {
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
     }
 
-    if (i < num_workers - 1)
+    if (i < num_workers - 1) {
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
-  }
-
-  // Encode a frame
-  for (i = 0; i < num_workers; i++) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-
-    // Set the starting tile for each thread.
-    thread_data->start = i;
+      thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+      for (int j = 0; j < 2; ++j) {
+        thread_data->td->mb.tmp_obmc_bufs[j] =
+            thread_data->td->tmp_obmc_bufs[j];
+      }
 
-    if (i == cpi->num_workers - 1)
-      winterface->execute(worker);
-    else
-      winterface->launch(worker);
+      thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+      for (int j = 0; j < 2; ++j) {
+        thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+            thread_data->td->mb.tmp_obmc_bufs[j];
+      }
+    }
   }
+}
 
-  // Encoding ends.
-  for (i = 0; i < num_workers; i++) {
-    AVxWorker *const worker = &cpi->workers[i];
-    winterface->sync(worker);
-  }
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
 
-  for (i = 0; i < num_workers; i++) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-    cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
-    // Accumulate counters.
-    if (i < cpi->num_workers - 1) {
-      av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
-      accumulate_rd_opt(&cpi->td, thread_data->td);
-      cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
-    }
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+    av1_alloc_tile_data(cpi);
+
+  av1_init_tile_data(cpi);
+  // Only run once to create threads and allocate thread data.
+  if (cpi->num_workers == 0) {
+    create_enc_workers(cpi, num_workers);
+  } else {
+    num_workers = AOMMIN(num_workers, cpi->num_workers);
   }
+  prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+  launch_enc_workers(cpi, num_workers);
+  sync_enc_workers(cpi, num_workers);
+  accumulate_counters_enc_workers(cpi, num_workers);
 }
 
 // Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int'
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
index b6b1fed4e..5de4b4803 100644
--- a/third_party/aom/av1/encoder/ethread.h
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_ETHREAD_H_
-#define AV1_ENCODER_ETHREAD_H_
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -34,4 +34,4 @@ void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_ETHREAD_H_
+#endif  // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
index 48178b964..e0432cc97 100644
--- a/third_party/aom/av1/encoder/extend.h
+++ b/third_party/aom/av1/encoder/extend.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_EXTEND_H_
-#define AV1_ENCODER_EXTEND_H_
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
 
 #include "aom_scale/yv12config.h"
 #include "aom/aom_integer.h"
@@ -29,4 +29,4 @@ void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_EXTEND_H_
+#endif  // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index ef0800c79..69dd20c52 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -31,6 +31,7 @@
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
@@ -39,7 +40,7 @@
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/rd.h"
-#include "av1/encoder/dwt.h"
+#include "av1/encoder/reconinter_enc.h"
 
 #define OUTPUT_FPF 0
 #define ARF_STATS_OUTPUT 0
@@ -51,9 +52,10 @@
 #define FACTOR_PT_LOW 0.70
 #define FACTOR_PT_HIGH 0.90
 #define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 96.0
+#define GF_MAX_BOOST 90.0
 #define INTRA_MODE_PENALTY 1024
-#define KF_MAX_BOOST 128.0
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
 #define MIN_ARF_GF_BOOST 240
 #define MIN_DECAY_FACTOR 0.01
 #define MIN_KF_BOOST 300
@@ -62,6 +64,7 @@
 #define DEFAULT_GRP_WEIGHT 1.0
 #define RC_FACTOR_MIN 0.75
 #define RC_FACTOR_MAX 1.75
+#define MIN_FWD_KF_INTERVAL 8
 
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
@@ -1562,576 +1565,9 @@ static int calculate_boost_bits(int frame_count, int boost,
                 0);
 }
 
-#if USE_GF16_MULTI_LAYER
-// === GF Group of 16 ===
-#define GF_INTERVAL_16 16
-#define GF_FRAME_PARAMS (REF_FRAMES + 5)
-
-// GF Group of 16: multi-layer hierarchical coding structure
-//   1st Layer: Frame 0 and Frame 16 (ALTREF)
-//   2nd Layer: Frame 8 (ALTREF2)
-//   3rd Layer: Frame 4 and 12 (ALTREF2)
-//   4th Layer: Frame 2, 6, 10, and 14 (BWDREF)
-//   5th Layer: Frame 1, 3, 5, 7, 9, 11, 13, and 15
-static const unsigned char gf16_multi_layer_params[][GF_FRAME_PARAMS] = {
-  // gf_group->index: coding order
-  // (Frame #)      : display order
-  {
-      // gf_group->index == 0 (Frame 0)
-      OVERLAY_UPDATE,  // update_type
-      0,               // arf_src_offset
-      0,               // brf_src_offset
-      // References (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF_FRAME,  // Index (current) of reference to get updated
-      GOLDEN_FRAME   // cpi->refresh_golden_frame = 1
-  },
-  {
-      // gf_group->index == 1 (Frame 16)
-      ARF_UPDATE,          // update_type
-      GF_INTERVAL_16 - 1,  // arf_src_offset
-      0,                   // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      ALTREF_FRAME,   // cpi->alt_fb_idx ===> cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      GOLDEN_FRAME,   // cpi->gld_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF_FRAME,  // Index (current) of reference to get updated
-      ALTREF_FRAME   // cpi->refresh_alt_ref_frame = 1
-  },
-  {
-      // gf_group->index == 2 (Frame 8)
-      INTNL_ARF_UPDATE,           // update_type
-      (GF_INTERVAL_16 >> 1) - 1,  // arf_src_offset
-      0,                          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF2_FRAME,  // Index (current) of reference to get updated
-      ALTREF2_FRAME   // cpi->refresh_alt2_ref_frame = 1
-  },
-  {
-      // gf_group->index == 3 (Frame 4)
-      INTNL_ARF_UPDATE,           // update_type
-      (GF_INTERVAL_16 >> 2) - 1,  // arf_src_offset
-      0,                          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
-                      // (BWDREF_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
-                      // (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF2_FRAME,  // Index (current) of reference to get updated
-      ALTREF2_FRAME   // cpi->refresh_alt2_ref_frame = 1
-  },
-  {
-      // gf_group->index == 4 (Frame 2)
-      BRF_UPDATE,  // update_type
-      0,           // arf_src_offset
-      1,           // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
-                      // (BWDREF_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
-                      // (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      REF_FRAMES,   // Index (current) of reference to get updated
-      BWDREF_FRAME  // cpi->refresh_bwd_ref_frame = 1
-  },
-  {
-      // gf_group->index == 5 (Frame 1)
-      LAST_BIPRED_UPDATE,  // update_type
-      0,                   // arf_src_offset
-      0,                   // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx ===> cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 6 (Frame 3)
-      LF_UPDATE,  // update_type
-      0,          // arf_src_offset
-      0,          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 7 (Frame 4 - OVERLAY)
-      INTNL_OVERLAY_UPDATE,  // update_type
-      0,                     // arf_src_offset
-      0,                     // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      BWDREF_FRAME,  // Index (current) of reference to get updated
-      ALTREF2_FRAME  // cpi->refresh_alt2_ref_frame = 1
-  },
-  {
-      // gf_group->index == 8 (Frame 6)
-      BRF_UPDATE,  // update_type
-      0,           // arf_src_offset
-      1,           // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx -> cpi->bwd_fb_idx (BWDREF_FRAME)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF2_FRAME,  // Index (current) of reference to get updated
-      BWDREF_FRAME    // cpi->refresh_bwd_frame = 1
-  },
-  {
-      // gf_group->index == 9 (Frame 5)
-      LAST_BIPRED_UPDATE,  // update_type
-      0,                   // arf_src_offset
-      0,                   // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 10 (Frame 7)
-      LF_UPDATE,  // update_type
-      0,          // arf_src_offset
-      0,          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 11 (Frame 8 - OVERLAY)
-      INTNL_OVERLAY_UPDATE,  // update_type
-      0,                     // arf_src_offset
-      0,                     // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      BWDREF_FRAME,  // Index (current) of reference to get updated
-      ALTREF2_FRAME  // cpi->refresh_alt2_ref_frame = 1
-  },
-  {
-      // gf_group->index == 12 (Frame 12)
-      INTNL_ARF_UPDATE,           // update_type
-      (GF_INTERVAL_16 >> 2) - 1,  // arf_src_offset
-      0,                          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    //  cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      //  cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF2_FRAME,  // Index (current) of reference to get updated
-      ALTREF2_FRAME   // cpi->refresh_alt2_ref_frame = 1
-  },
-  {
-      // gf_group->index == 13 (Frame 10)
-      BRF_UPDATE,  // update_type
-      0,           // arf_src_offset
-      1,           // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF2_FRAME,  // Index (current) of reference to get updated
-      BWDREF_FRAME    // cpi->refresh_bwd_frame = 1
-  },
-  {
-      // gf_group->index == 14 (Frame 9)
-      LAST_BIPRED_UPDATE,  // update_type
-      0,                   // arf_src_offset
-      0,                   // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 15 (Frame 11)
-      LF_UPDATE,  // update_type
-      0,          // arf_src_offset
-      0,          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 16 (Frame 12 - OVERLAY)
-      INTNL_OVERLAY_UPDATE,  // update_type
-      0,                     // arf_src_offset
-      0,                     // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      BWDREF_FRAME,  // Index (current) of reference to get updated
-      ALTREF2_FRAME  // cpi->refresh_alt2_ref_frame = 1
-  },
-  {
-      // gf_group->index == 17 (Frame 14)
-      BRF_UPDATE,  // update_type
-      0,           // arf_src_offset
-      1,           // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      BWDREF_FRAME,  // Index (current) of reference to get updated
-      BWDREF_FRAME   // cpi->refresh_bwd_frame = 1
-  },
-  {
-      // gf_group->index == 18 (Frame 13)
-      LAST_BIPRED_UPDATE,  // update_type
-      0,                   // arf_src_offset
-      0,                   // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 19 (Frame 15)
-      LF_UPDATE,  // update_type
-      0,          // arf_src_offset
-      0,          // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
-                      // LAST_FRAME]
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      LAST3_FRAME,  // Index (current) of reference to get updated
-      LAST_FRAME    // cpi->refresh_last_frame = 1
-  },
-  {
-      // gf_group->index == 20 (Frame 16 - OVERLAY: Belonging to the next GF
-      // group)
-      OVERLAY_UPDATE,  // update_type
-      0,               // arf_src_offset
-      0,               // brf_src_offset
-      // Reference frame indexes (previous ===> current)
-      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
-      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
-      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
-                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
-      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
-      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
-      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
-      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
-      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
-      // Refreshment (index, flag)
-      ALTREF_FRAME,  // Index (current) of reference to get updated
-      GOLDEN_FRAME   // cpi->refresh_golden_frame = 1
-  }
-};
-
-// === GF Group of 16 ===
-static void define_gf_group_structure_16(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  const int key_frame = cpi->common.frame_type == KEY_FRAME;
-
-  assert(rc->baseline_gf_interval == GF_INTERVAL_16);
-
-  // Total number of frames to consider for GF group of 16:
-  //   = GF group interval + number of OVERLAY's
-  //   = rc->baseline_gf_interval + MAX_EXT_ARFS + 1 + 1
-  // NOTE: The OVERLAY frame for the next GF group also needs to consider to
-  //       prepare for the reference frame index mapping.
-
-  const int gf_update_frames = rc->baseline_gf_interval + MAX_EXT_ARFS + 2;
-
-  for (int frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
-    int param_idx = 0;
-
-    // Treat KEY_FRAME differently
-    if (frame_index == 0 && key_frame) {
-      gf_group->update_type[frame_index] = KF_UPDATE;
-
-      gf_group->rf_level[frame_index] = KF_STD;
-      gf_group->arf_src_offset[frame_index] = 0;
-      gf_group->brf_src_offset[frame_index] = 0;
-      gf_group->bidir_pred_enabled[frame_index] = 0;
-      for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
-        gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx;
-      gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
-      gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
-
-      continue;
-    }
-
-    // == update_type ==
-    gf_group->update_type[frame_index] =
-        gf16_multi_layer_params[frame_index][param_idx++];
-
-    // == rf_level ==
-    // Derive rf_level from update_type
-    switch (gf_group->update_type[frame_index]) {
-      case LF_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
-      case ARF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
-      case OVERLAY_UPDATE:
-        gf_group->rf_level[frame_index] = INTER_NORMAL;
-        break;
-      case BRF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
-      case LAST_BIPRED_UPDATE:
-        gf_group->rf_level[frame_index] = INTER_NORMAL;
-        break;
-      case BIPRED_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
-      case INTNL_ARF_UPDATE:
-        gf_group->rf_level[frame_index] = GF_ARF_LOW;
-        break;
-      case INTNL_OVERLAY_UPDATE:
-        gf_group->rf_level[frame_index] = INTER_NORMAL;
-        break;
-      default: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
-    }
-
-    // == arf_src_offset ==
-    gf_group->arf_src_offset[frame_index] =
-        gf16_multi_layer_params[frame_index][param_idx++];
-
-    // == brf_src_offset ==
-    gf_group->brf_src_offset[frame_index] =
-        gf16_multi_layer_params[frame_index][param_idx++];
-
-    // == bidir_pred_enabled ==
-    // Derive bidir_pred_enabled from bidir_src_offset
-    gf_group->bidir_pred_enabled[frame_index] =
-        gf_group->brf_src_offset[frame_index] ? 1 : 0;
-
-    // == ref_fb_idx_map ==
-    for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
-      gf_group->ref_fb_idx_map[frame_index][ref_idx] =
-          gf16_multi_layer_params[frame_index][param_idx++];
-
-    // == refresh_idx ==
-    gf_group->refresh_idx[frame_index] =
-        gf16_multi_layer_params[frame_index][param_idx++];
-
-    // == refresh_flag ==
-    gf_group->refresh_flag[frame_index] =
-        gf16_multi_layer_params[frame_index][param_idx];
-  }
-
-  // Mark the ARF_UPDATE / INTNL_ARF_UPDATE and OVERLAY_UPDATE /
-  // INTNL_OVERLAY_UPDATE for rate allocation
-  // NOTE: Indexes are designed in the display order backward:
-  //       ALT[3] .. ALT[2] .. ALT[1] .. ALT[0],
-  //       but their coding order is as follows:
-  // ALT0-ALT2-ALT3 .. OVERLAY3 .. OVERLAY2-ALT1 .. OVERLAY1 .. OVERLAY0
-
-  const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
-  const int sub_arf_interval = rc->baseline_gf_interval / num_arfs_in_gf;
-
-  // == arf_pos_for_ovrly ==: Position for OVERLAY
-  for (int arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
-    const int prior_num_arfs =
-        (arf_idx <= 1) ? num_arfs_in_gf : (num_arfs_in_gf - 1);
-    cpi->arf_pos_for_ovrly[arf_idx] =
-        sub_arf_interval * (num_arfs_in_gf - arf_idx) + prior_num_arfs;
-  }
-
-  // == arf_pos_in_gf ==: Position for ALTREF
-  cpi->arf_pos_in_gf[0] = 1;
-  cpi->arf_pos_in_gf[1] = cpi->arf_pos_for_ovrly[2] + 1;
-  cpi->arf_pos_in_gf[2] = 2;
-  cpi->arf_pos_in_gf[3] = 3;
-
-  // == arf_update_idx ==
-  // == arf_ref_idx ==
-  // NOTE: Due to the hierarchical nature of GF16, these two parameters only
-  //       relect the index to the nearest future overlay.
-  int start_frame_index = 0;
-  for (int arf_idx = (num_arfs_in_gf - 1); arf_idx >= 0; --arf_idx) {
-    const int end_frame_index = cpi->arf_pos_for_ovrly[arf_idx];
-    for (int frame_index = start_frame_index; frame_index <= end_frame_index;
-         ++frame_index) {
-      gf_group->arf_update_idx[frame_index] = arf_idx;
-      gf_group->arf_ref_idx[frame_index] = arf_idx;
-    }
-    start_frame_index = end_frame_index + 1;
-  }
-}
-#endif  // USE_GF16_MULTI_LAYER
-
 #if USE_SYMM_MULTI_LAYER
+// #define CHCEK_GF_PARAMETER
+#ifdef CHCEK_GF_PARAMETER
 void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
                         int frame_nums) {
   static const char *update_type_strings[] = {
@@ -2149,9 +1585,15 @@ void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
             gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
             gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
   }
+
+  fprintf(fid, "number of nodes in each level: \n");
+  for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
+    fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+  }
+  fprintf(fid, "\n");
   fclose(fid);
 }
-
+#endif  // CHCEK_GF_PARAMETER
 static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
   // Derive rf_level from update_type
   switch (update_type) {
@@ -2169,14 +1611,17 @@ static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
 
 static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
                                    int *frame_ind, int arf_ind, int level) {
-  if (r - l == 2) {
-    // leaf node, not a look-ahead frame
-    gf_group->update_type[*frame_ind] = LF_UPDATE;
-    gf_group->arf_src_offset[*frame_ind] = 0;
-    gf_group->arf_pos_in_gf[*frame_ind] = 0;
-    gf_group->arf_update_idx[*frame_ind] = arf_ind;
-    gf_group->pyramid_level[*frame_ind] = level;
-    ++(*frame_ind);
+  if (r - l < 4) {
+    while (++l < r) {
+      // leaf nodes, not a look-ahead frame
+      gf_group->update_type[*frame_ind] = LF_UPDATE;
+      gf_group->arf_src_offset[*frame_ind] = 0;
+      gf_group->arf_pos_in_gf[*frame_ind] = 0;
+      gf_group->arf_update_idx[*frame_ind] = arf_ind;
+      gf_group->pyramid_level[*frame_ind] = 0;
+      ++gf_group->pyramid_lvl_nodes[0];
+      ++(*frame_ind);
+    }
   } else {
     int m = (l + r) / 2;
     int arf_pos_in_gf = *frame_ind;
@@ -2186,6 +1631,7 @@ static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
     gf_group->arf_pos_in_gf[*frame_ind] = 0;
     gf_group->arf_update_idx[*frame_ind] = 1;  // mark all internal ARF 1
     gf_group->pyramid_level[*frame_ind] = level;
+    ++gf_group->pyramid_lvl_nodes[level];
     ++(*frame_ind);
 
     // set parameters for frames displayed before this frame
@@ -2209,7 +1655,7 @@ static INLINE unsigned char get_pyramid_height(int pyramid_width) {
   assert(pyramid_width <= 16 && pyramid_width >= 4 &&
          "invalid gf interval for pyramid structure");
 
-  return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2);
+  return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
 }
 
 static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
@@ -2217,6 +1663,10 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
   int frame_index = 0;
   gf_group->pyramid_height = get_pyramid_height(gf_interval);
 
+  assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
+
+  av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+
   // At the beginning of each GF group it will be a key or overlay frame,
   gf_group->update_type[frame_index] = OVERLAY_UPDATE;
   gf_group->arf_src_offset[frame_index] = 0;
@@ -2236,9 +1686,6 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
   // set parameters for the rest of the frames
   set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
                          gf_group->pyramid_height - 1);
-
-  // check_frame_params(gf_group, gf_interval, frame_index);
-
   return frame_index;
 }
 
@@ -2248,8 +1695,8 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) {
   GF_GROUP *const gf_group = &twopass->gf_group;
   const int key_frame = cpi->common.frame_type == KEY_FRAME;
 
-  assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 ||
-         rc->baseline_gf_interval == 16);
+  assert(rc->baseline_gf_interval >= 4 &&
+         rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
 
   const int gf_update_frames =
       construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
@@ -2305,8 +1752,9 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) {
 
   // This parameter is useless?
   gf_group->arf_ref_idx[frame_index] = 0;
-
+#ifdef CHCEK_GF_PARAMETER
   check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
+#endif
 }
 
 // It is an example of how to define a GF stucture manually. The function will
@@ -2447,16 +1895,10 @@ static int define_gf_group_structure_4(AV1_COMP *cpi) {
 static void define_gf_group_structure(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
 
-#if USE_GF16_MULTI_LAYER
-  if (rc->baseline_gf_interval == 16) {
-    define_gf_group_structure_16(cpi);
-    return;
-  }
-#endif  // USE_GF16_MULTI_LAYER
 #if USE_SYMM_MULTI_LAYER
-  const int valid_customized_gf_length = rc->baseline_gf_interval == 4 ||
-                                         rc->baseline_gf_interval == 8 ||
-                                         rc->baseline_gf_interval == 16;
+  const int valid_customized_gf_length =
+      rc->baseline_gf_interval >= 4 &&
+      rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
   // used the new structure only if extra_arf is allowed
   if (valid_customized_gf_length && rc->source_alt_ref_pending &&
       cpi->extra_arf_allowed > 0) {
@@ -2685,6 +2127,18 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
   gf_group->brf_src_offset[frame_index] = 0;
 }
 
+#if USE_SYMM_MULTI_LAYER
+#define LEAF_REDUCTION_FACTOR 0.75f
+#define LVL_3_BOOST_FACTOR 0.8f
+#define LVL_2_BOOST_FACTOR 0.3f
+
+static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+  { 1, 0, 0 },
+  { LVL_3_BOOST_FACTOR, 0, 0 },  // Leaking budget works better
+  { LVL_3_BOOST_FACTOR, (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR,
+    (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) }
+};
+#endif  // USE_SYMM_MULTI_LAYER
 static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
                                    double group_error, int gf_arf_bits) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2771,20 +2225,39 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
       // BIPRED_UPDATE frames need to be further adjusted.
       gf_group->bit_allocation[frame_index] = target_frame_size;
 #if USE_SYMM_MULTI_LAYER
-    } else if (cpi->new_bwdref_update_rule == 1 &&
+    } else if (cpi->new_bwdref_update_rule &&
                gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+      assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+             gf_group->pyramid_height >= 0 &&
+             "non-valid height for a pyramid structure");
+
       int arf_pos = gf_group->arf_pos_in_gf[frame_index];
       gf_group->bit_allocation[frame_index] = 0;
 
-      // Tried boosting up the allocated bits on backward reference frame
-      // by (target_frame_size >> 2) as in the original setting. However it
-      // does not bring gains for pyramid structure with GF length = 16.
       gf_group->bit_allocation[arf_pos] = target_frame_size;
-#endif
+#if MULTI_LVL_BOOST_VBR_CQ
+      const int pyr_h = gf_group->pyramid_height - 2;
+      const int this_lvl = gf_group->pyramid_level[arf_pos];
+      const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+
+      const float_t budget =
+          LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0];
+      const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] /
+                                gf_group->pyramid_lvl_nodes[this_lvl];
+
+      gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost);
+#endif  // MULTI_LVL_BOOST_VBR_CQ
+#endif  // USE_SYMM_MULTI_LAYER
     } else {
       assert(gf_group->update_type[frame_index] == LF_UPDATE ||
              gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
       gf_group->bit_allocation[frame_index] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+      if (cpi->new_bwdref_update_rule) {
+        gf_group->bit_allocation[frame_index] -=
+            (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+      }
+#endif  // MULTI_LVL_BOOST_VBR_CQ
     }
 
     ++frame_index;
@@ -2833,9 +2306,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int i;
 
   double boost_score = 0.0;
-#if !FIX_GF_INTERVAL_LENGTH
+#if !CONFIG_FIX_GF_LENGTH
   double old_boost_score = 0.0;
   double mv_ratio_accumulator_thresh;
+  int active_max_gf_interval;
+  int active_min_gf_interval;
 #endif
   double gf_group_err = 0.0;
 #if GROUP_ADAPTIVE_MAXQ
@@ -2862,8 +2337,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int f_boost = 0;
   int b_boost = 0;
   int flash_detected;
-  int active_max_gf_interval;
-  int active_min_gf_interval;
   int64_t gf_group_bits;
   double gf_group_error_left;
   int gf_arf_bits;
@@ -2898,11 +2371,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     gf_group_skip_pct -= this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
-#if !FIX_GF_INTERVAL_LENGTH
+#if !CONFIG_FIX_GF_LENGTH
   // Motion breakout threshold for loop below depends on image size.
   mv_ratio_accumulator_thresh =
       (cpi->initial_height + cpi->initial_width) / 4.0;
-#endif
   // Set a maximum and minimum interval for the GF group.
   // If the image appears almost completely static we can extend beyond this.
   {
@@ -2915,23 +2387,19 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (active_min_gf_interval > rc->max_gf_interval)
       active_min_gf_interval = rc->max_gf_interval;
 
-    if (cpi->multi_arf_allowed) {
+    // The value chosen depends on the active Q range. At low Q we have
+    // bits to spare and are better with a smaller interval and smaller boost.
+    // At high Q when there are few bits to spare we are better with a longer
+    // interval to spread the cost of the GF.
+    active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+    // We have: active_min_gf_interval <= rc->max_gf_interval
+    if (active_max_gf_interval < active_min_gf_interval)
+      active_max_gf_interval = active_min_gf_interval;
+    else if (active_max_gf_interval > rc->max_gf_interval)
       active_max_gf_interval = rc->max_gf_interval;
-    } else {
-      // The value chosen depends on the active Q range. At low Q we have
-      // bits to spare and are better with a smaller interval and smaller boost.
-      // At high Q when there are few bits to spare we are better with a longer
-      // interval to spread the cost of the GF.
-      active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
-
-      // We have: active_min_gf_interval <= rc->max_gf_interval
-      if (active_max_gf_interval < active_min_gf_interval)
-        active_max_gf_interval = active_min_gf_interval;
-      else if (active_max_gf_interval > rc->max_gf_interval)
-        active_max_gf_interval = rc->max_gf_interval;
-    }
   }
-
+#endif  // !CONFIG_FIX_GF_LENGTH
   double avg_sr_coded_error = 0;
   double avg_raw_err_stdev = 0;
   int non_zero_stdev_count = 0;
@@ -2990,10 +2458,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     boost_score +=
         decay_accumulator *
         calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-#if FIX_GF_INTERVAL_LENGTH
+#if CONFIG_FIX_GF_LENGTH
     if (i == (FIXED_GF_LENGTH + 1)) break;
 #else
-    // Skip breaking condition for FIX_GF_INTERVAL_LENGTH
+    // Skip breaking condition for CONFIG_FIX_GF_LENGTH
     // Break out conditions.
     if (
         // Break at active_max_gf_interval unless almost totally static.
@@ -3017,7 +2485,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       }
     }
     old_boost_score = boost_score;
-#endif  // FIX_GF_INTERVAL_LENGTH
+#endif  // CONFIG_FIX_GF_LENGTH
     *this_frame = next_frame;
   }
   twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
@@ -3030,44 +2498,116 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   assert(num_mbs > 0);
   if (i) avg_sr_coded_error /= i;
 
+  if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+  // Disable extra altrefs and backward refs for "still" gf group:
+  //   zero_motion_accumulator: minimum percentage of (0,0) motion;
+  //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
+  //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
+  //                            motion error per block of each frame.
+  const int disable_bwd_extarf =
+      (zero_motion_accumulator > MIN_ZERO_MOTION &&
+       avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+       avg_raw_err_stdev < MAX_RAW_ERR_VAR);
+
+  if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+  int alt_offset = 0;
+#if REDUCE_LAST_GF_LENGTH
+  // TODO(weitinglin): The length reduction strategy was tuned using AOM_Q
+  // mode, and hurts the performance of VBR mode. We need to investigate how
+  // to adjust GF length for other modes.
+
+  int allow_gf_length_reduction =
+      cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0;
+
+  // We are going to have an alt ref, but we don't have to do adjustment for
+  // lossless mode
+  if (allow_alt_ref && allow_gf_length_reduction &&
+      (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) &&
+      !is_lossless_requested(&cpi->oxcf)) {
+    // adjust length of this gf group if one of the following conditions is met
+    // 1: only one overlay frame left and this gf is too long
+    // 2: next gf group is too short to have arf compared to the current gf
+
+    // maximum length of next gf group
+    const int next_gf_len = rc->frames_to_key - i;
+    const int single_overlay_left =
+        next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter than
+    // this gf
+    const int unbalanced_gf =
+        i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 >= rc->min_gf_interval;
+
+    if (single_overlay_left || unbalanced_gf) {
+      // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but it does not work
+      // better in the current setting
+      const int roll_back = REDUCE_GF_LENGTH_BY;
+      alt_offset = -roll_back;
+      i -= roll_back;
+    }
+  }
+#endif
+
   // Should we use the alternate reference frame.
   if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
       (i >= rc->min_gf_interval)) {
     // Calculate the boost for alt ref.
     rc->gfu_boost =
-        calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+        calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
     rc->source_alt_ref_pending = 1;
+
+    // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF
+    cpi->preserve_arf_as_gld = 1;
   } else {
     rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
     rc->source_alt_ref_pending = 0;
+    cpi->preserve_arf_as_gld = 0;
   }
 
   // Set the interval until the next gf.
-  if (cpi->oxcf.fwd_kf_enabled) {
-    // Ensure the gf group before the next keyframe will contain an altref
-    if ((rc->frames_to_key - i < rc->min_gf_interval) &&
-        (rc->frames_to_key != i)) {
-      rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval,
-                                        rc->static_scene_max_gf_interval);
-    } else {
+  // If forward keyframes are enabled, ensure the final gf group obeys the
+  // MIN_FWD_KF_INTERVAL.
+  if (cpi->oxcf.fwd_kf_enabled &&
+      ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+    if (i == rc->frames_to_key) {
       rc->baseline_gf_interval = i;
+      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+    } else if ((rc->frames_to_key - i <
+                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+               (rc->frames_to_key != i)) {
+      // if possible, merge the last two gf groups
+      if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
+        rc->baseline_gf_interval = rc->frames_to_key;
+        // if merging the last two gf groups creates a group that is too long,
+        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
+      } else {
+        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+      }
+    } else {
+      rc->baseline_gf_interval =
+          i - (is_key_frame || rc->source_alt_ref_pending);
     }
   } else {
     rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
   }
-  if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
 
-  // Disable extra altrefs and backward refs for "still" gf group:
-  //   zero_motion_accumulator: minimum percentage of (0,0) motion;
-  //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
-  //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
-  //                            motion error per block of each frame.
-  const int disable_bwd_extarf =
-      (zero_motion_accumulator > MIN_ZERO_MOTION &&
-       avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
-       avg_raw_err_stdev < MAX_RAW_ERR_VAR);
-
-  if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+#if REDUCE_LAST_ALT_BOOST
+#define LAST_ALR_BOOST_FACTOR 0.2f
+  rc->arf_boost_factor = 1.0;
+  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+    // Reduce the boost of altref in the last gf group
+    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+        rc->frames_to_key - i == 0) {
+      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+    }
+  }
+#endif
 
   if (!cpi->extra_arf_allowed) {
     cpi->num_extra_arfs = 0;
@@ -3439,6 +2979,11 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // how many bits to spend on it.
   decay_accumulator = 1.0;
   boost_score = 0.0;
+  const double kf_max_boost =
+      cpi->oxcf.rc_mode == AOM_Q
+          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+                   KF_MAX_FRAME_BOOST)
+          : KF_MAX_FRAME_BOOST;
   for (i = 0; i < (rc->frames_to_key - 1); ++i) {
     if (EOF == input_stats(twopass, &next_frame)) break;
 
@@ -3450,7 +2995,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if ((i <= rc->max_gf_interval) ||
         ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
       const double frame_boost =
-          calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+          calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
 
       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
@@ -3513,147 +3058,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->modified_error_left -= kf_group_err;
 }
 
-#if USE_GF16_MULTI_LAYER
-// === GF Group of 16 ===
-void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-
-  int ref_fb_idx_prev[REF_FRAMES];
-  int ref_fb_idx_curr[REF_FRAMES];
-
-  for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
-    ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame];
-  }
-
-  // Update map index for each reference frame
-  for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
-    int ref_frame = gf_group->ref_fb_idx_map[gf_frame_index][ref_idx];
-    ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME];
-  }
-
-  for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
-    cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame];
-  }
-}
-
-// Define the reference buffers that will be updated post encode.
-static void configure_buffer_updates_16(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-
-  if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
-    cpi->refresh_fb_idx = 0;
-
-    cpi->refresh_last_frame = 1;
-    cpi->refresh_golden_frame = 1;
-    cpi->refresh_bwd_ref_frame = 1;
-    cpi->refresh_alt2_ref_frame = 1;
-    cpi->refresh_alt_ref_frame = 1;
-
-    return;
-  }
-
-  // Update reference frame map indexes
-  av1_ref_frame_map_idx_updates(cpi, gf_group->index);
-
-  // Update refresh index
-  switch (gf_group->refresh_idx[gf_group->index]) {
-    case LAST_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME];
-      break;
-
-    case LAST2_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME];
-      break;
-
-    case LAST3_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME];
-      break;
-
-    case GOLDEN_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
-      break;
-
-    case BWDREF_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1];
-      break;
-
-    case ALTREF2_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
-      break;
-
-    case ALTREF_FRAME:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
-      break;
-
-    case REF_FRAMES:
-      cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1];
-      break;
-
-    default: assert(0); break;
-  }
-
-  // Update refresh flags
-  switch (gf_group->refresh_flag[gf_group->index]) {
-    case LAST_FRAME:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case GOLDEN_FRAME:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case BWDREF_FRAME:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case ALTREF2_FRAME:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 1;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case ALTREF_FRAME:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-
-    default: assert(0); break;
-  }
-
-  switch (gf_group->update_type[gf_group->index]) {
-    case BRF_UPDATE: cpi->rc.is_bwd_ref_frame = 1; break;
-
-    case LAST_BIPRED_UPDATE: cpi->rc.is_last_bipred_frame = 1; break;
-
-    case BIPRED_UPDATE: cpi->rc.is_bipred_frame = 1; break;
-
-    case INTNL_OVERLAY_UPDATE: cpi->rc.is_src_frame_ext_arf = 1;
-    case OVERLAY_UPDATE: cpi->rc.is_src_frame_alt_ref = 1; break;
-
-    default: break;
-  }
-}
-#endif  // USE_GF16_MULTI_LAYER
-
 // Define the reference buffers that will be updated post encode.
 static void configure_buffer_updates(AV1_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
@@ -3667,14 +3071,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
   cpi->rc.is_bipred_frame = 0;
   cpi->rc.is_src_frame_ext_arf = 0;
 
-#if USE_GF16_MULTI_LAYER
-  RATE_CONTROL *const rc = &cpi->rc;
-  if (rc->baseline_gf_interval == 16) {
-    configure_buffer_updates_16(cpi);
-    return;
-  }
-#endif  // USE_GF16_MULTI_LAYER
-
   switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
     case KF_UPDATE:
       cpi->refresh_last_frame = 1;
@@ -3979,8 +3375,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
                             : cpi->common.MBs;
     // The multiplication by 256 reverses a scaling factor of (>> 8)
     // applied when combining MB error values for the frame.
-    twopass->mb_av_energy =
-        log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+    twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
     twopass->frame_avg_haar_energy =
         log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
   }
@@ -4020,9 +3415,6 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
   }
   twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
 
-  // Increment the gf group index ready for the next frame.
-  ++twopass->gf_group.index;
-
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((cpi->oxcf.rc_mode != AOM_Q) &&
       (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index b0c1a21e4..4b7325ae2 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_FIRSTPASS_H_
-#define AV1_ENCODER_FIRSTPASS_H_
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
 
 #include "av1/common/enums.h"
 #include "av1/common/onyxc_int.h"
@@ -47,15 +47,7 @@ typedef struct {
 //       number of bi-predictive frames.
 #define BFG_INTERVAL 2
 // The maximum number of extra ALTREF's except ALTREF_FRAME
-// NOTE: REF_FRAMES indicates the maximum number of frames that may be buffered
-//       to serve as references. Currently REF_FRAMES == 8.
-#define USE_GF16_MULTI_LAYER 0
-
-#if USE_GF16_MULTI_LAYER
-#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME)
-#else  // !USE_GF16_MULTI_LAYER
 #define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
-#endif  // USE_GF16_MULTI_LAYER
 
 #define MIN_EXT_ARF_INTERVAL 4
 
@@ -126,6 +118,7 @@ typedef struct {
   unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char pyramid_height;
+  unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
 #endif
   unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
@@ -197,10 +190,6 @@ void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
 // Post encode update of the rate control parameters for 2-pass
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
-#if USE_GF16_MULTI_LAYER
-void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index);
-#endif  // USE_GF16_MULTI_LAYER
-
 static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
   if (arf_pending && MAX_EXT_ARFS > 0)
     return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
@@ -216,4 +205,4 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_FIRSTPASS_H_
+#endif  // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
index f07d1bc00..e9f8b0bb4 100644
--- a/third_party/aom/av1/encoder/global_motion.c
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -32,8 +32,8 @@
 // Border over which to compute the global motion
 #define ERRORADV_BORDER 0
 
-static const double erroradv_tr[] = { 0.75, 0.70, 0.65 };
-static const double erroradv_prod_tr[] = { 22000, 20000, 18000 };
+static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
 
 int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
                              int erroradv_type) {
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
index 2c15753fd..c7c016c43 100644
--- a/third_party/aom/av1/encoder/global_motion.h
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_GLOBAL_MOTION_H_
-#define AV1_ENCODER_GLOBAL_MOTION_H_
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
 
 #include "aom/aom_integer.h"
 #include "aom_scale/yv12config.h"
@@ -61,4 +61,4 @@ int compute_global_motion_feature_based(TransformationType type,
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // AV1_ENCODER_GLOBAL_MOTION_H_
+#endif  // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
index 45632da9b..945dc3733 100644
--- a/third_party/aom/av1/encoder/grain_test_vectors.h
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_GRAIN_TEST_VECTORS_H_
-#define AV1_GRAIN_TEST_VECTORS_H_
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
 
 /* Test vectors for emulation of different film grain types.
  * Note that bit depth would be derived from the bitstream and
@@ -778,4 +778,4 @@ static aom_film_grain_t film_grain_test_vectors[16] = {
       45231 /* random_seed */
   },
 };
-#endif  // AV1_GRAIN_TEST_VECTORS_H_
+#endif  // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
index 8b6227540..826c004d6 100644
--- a/third_party/aom/av1/encoder/hash.h
+++ b/third_party/aom/av1/encoder/hash.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_HASH_H_
-#define AV1_ENCODER_HASH_H_
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
 
 #include "config/aom_config.h"
 
@@ -43,8 +43,10 @@ typedef struct _CRC32C {
 // init table for software version crc32c
 void av1_crc32c_calculator_init(CRC32C *p_crc32c);
 
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_HASH_H_
+#endif  // AOM_AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
index f2ff5b495..e85a516e8 100644
--- a/third_party/aom/av1/encoder/hash_motion.c
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -13,14 +13,12 @@
 
 #include "config/av1_rtcd.h"
 
+#include "av1/encoder/block.h"
 #include "av1/encoder/hash.h"
 #include "av1/encoder/hash_motion.h"
 
 static const int crc_bits = 16;
 static const int block_size_bits = 3;
-static CRC_CALCULATOR crc_calculator1;
-static CRC_CALCULATOR crc_calculator2;
-static int g_crc_initialized = 0;
 
 static void hash_table_clear_all(hash_table *p_hash_table) {
   if (p_hash_table->p_lookup_table == NULL) {
@@ -106,11 +104,11 @@ static int hash_block_size_to_index(int block_size) {
   }
 }
 
-void av1_hash_table_init(hash_table *p_hash_table) {
-  if (g_crc_initialized == 0) {
-    av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB);
-    av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB);
-    g_crc_initialized = 1;
+void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) {
+  if (x->g_crc_initialized == 0) {
+    av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB);
+    av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB);
+    x->g_crc_initialized = 1;
   }
   p_hash_table->p_lookup_table = NULL;
 }
@@ -181,7 +179,8 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
 
 void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
                                        uint32_t *pic_block_hash[2],
-                                       int8_t *pic_block_same_info[3]) {
+                                       int8_t *pic_block_same_info[3],
+                                       MACROBLOCK *x) {
   const int width = 2;
   const int height = 2;
   const int x_end = picture->y_crop_width - width + 1;
@@ -201,9 +200,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
         pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
 
         pic_block_hash[0][pos] = av1_get_crc_value(
-            &crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
+            &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
         pic_block_hash[1][pos] = av1_get_crc_value(
-            &crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
+            &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
         pos++;
       }
       pos += width - 1;
@@ -220,9 +219,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
         pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
 
         pic_block_hash[0][pos] =
-            av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
+            av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0]));
         pic_block_hash[1][pos] =
-            av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
+            av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0]));
         pos++;
       }
       pos += width - 1;
@@ -235,7 +234,8 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
                                    uint32_t *src_pic_block_hash[2],
                                    uint32_t *dst_pic_block_hash[2],
                                    int8_t *src_pic_block_same_info[3],
-                                   int8_t *dst_pic_block_same_info[3]) {
+                                   int8_t *dst_pic_block_same_info[3],
+                                   MACROBLOCK *x) {
   const int pic_width = picture->y_crop_width;
   const int x_end = picture->y_crop_width - block_size + 1;
   const int y_end = picture->y_crop_height - block_size + 1;
@@ -254,14 +254,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
       p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
       p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
       dst_pic_block_hash[0][pos] =
-          av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length);
+          av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length);
 
       p[0] = src_pic_block_hash[1][pos];
       p[1] = src_pic_block_hash[1][pos + src_size];
       p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
       p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
       dst_pic_block_hash[1][pos] =
-          av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length);
+          av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length);
 
       dst_pic_block_same_info[0][pos] =
           src_pic_block_same_info[0][pos] &&
@@ -388,17 +388,9 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
   return 1;
 }
 
-// global buffer for hash value calculation of a block
-// used only in av1_get_block_hash_value()
-#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
-// [first hash/second hash]
-// [two buffers used ping-pong]
-// [num of 2x2 blocks in 128x128]
-static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH];
-
 void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
                               uint32_t *hash_value1, uint32_t *hash_value2,
-                              int use_highbitdepth) {
+                              int use_highbitdepth, MACROBLOCK *x) {
   uint32_t to_hash[4];
   const int add_value = hash_block_size_to_index(block_size) << crc_bits;
   assert(add_value >= 0);
@@ -415,10 +407,12 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
         get_pixels_in_1D_short_array_by_block_2x2(
             y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
         assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        hash_value_buffer[0][0][pos] = av1_get_crc_value(
-            &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
-        hash_value_buffer[1][0][pos] = av1_get_crc_value(
-            &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
+        x->hash_value_buffer[0][0][pos] =
+            av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash,
+                              sizeof(pixel_to_hash));
+        x->hash_value_buffer[1][0][pos] =
+            av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash,
+                              sizeof(pixel_to_hash));
       }
     }
   } else {
@@ -429,10 +423,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
         get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
                                                  stride, pixel_to_hash);
         assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        hash_value_buffer[0][0][pos] = av1_get_crc_value(
-            &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
-        hash_value_buffer[1][0][pos] = av1_get_crc_value(
-            &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+        x->hash_value_buffer[0][0][pos] = av1_get_crc_value(
+            &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+        x->hash_value_buffer[1][0][pos] = av1_get_crc_value(
+            &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
       }
     }
   }
@@ -457,24 +451,24 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
         assert(srcPos + src_sub_block_in_width + 1 <
                AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
         assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        to_hash[0] = hash_value_buffer[0][src_idx][srcPos];
-        to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1];
+        to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos];
+        to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1];
         to_hash[2] =
-            hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
-        to_hash[3] =
-            hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1];
+            x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = x->hash_value_buffer[0][src_idx]
+                                         [srcPos + src_sub_block_in_width + 1];
 
-        hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
-            &crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+        x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
+            &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
 
-        to_hash[0] = hash_value_buffer[1][src_idx][srcPos];
-        to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1];
+        to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos];
+        to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1];
         to_hash[2] =
-            hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
-        to_hash[3] =
-            hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width + 1];
-        hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
-            &crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+            x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = x->hash_value_buffer[1][src_idx]
+                                         [srcPos + src_sub_block_in_width + 1];
+        x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
+            &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
         dst_pos++;
       }
     }
@@ -483,8 +477,6 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
     sub_block_in_width >>= 1;
   }
 
-  *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
-  *hash_value2 = hash_value_buffer[1][dst_idx][0];
+  *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
+  *hash_value2 = x->hash_value_buffer[1][dst_idx][0];
 }
-
-#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
index 8deb92eb6..df3ec3215 100644
--- a/third_party/aom/av1/encoder/hash_motion.h
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_HASH_MOTION_H_
-#define AV1_ENCODER_HASH_MOTION_H_
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
 
 #include "config/aom_config.h"
 
@@ -34,7 +34,7 @@ typedef struct _hash_table {
   Vector **p_lookup_table;
 } hash_table;
 
-void av1_hash_table_init(hash_table *p_hash_table);
+void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
 void av1_hash_table_destroy(hash_table *p_hash_table);
 void av1_hash_table_create(hash_table *p_hash_table);
 int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
@@ -44,13 +44,15 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
                             uint32_t hash_value2);
 void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
                                        uint32_t *pic_block_hash[2],
-                                       int8_t *pic_block_same_info[3]);
+                                       int8_t *pic_block_same_info[3],
+                                       struct macroblock *x);
 void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
                                    int block_size,
                                    uint32_t *src_pic_block_hash[2],
                                    uint32_t *dst_pic_block_hash[2],
                                    int8_t *src_pic_block_same_info[3],
-                                   int8_t *dst_pic_block_same_info[3]);
+                                   int8_t *dst_pic_block_same_info[3],
+                                   struct macroblock *x);
 void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
                                                  uint32_t *pic_hash[2],
                                                  int8_t *pic_is_same,
@@ -67,10 +69,10 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
                                  int block_size, int x_start, int y_start);
 void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
                               uint32_t *hash_value1, uint32_t *hash_value2,
-                              int use_highbitdepth);
+                              int use_highbitdepth, struct macroblock *x);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_HASH_MOTION_H_
+#endif  // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
index 0922557d0..67898fd18 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -121,15 +121,45 @@ static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
-                        txfm_param->bd);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  switch (tx_type) {
+      // Use the C version for any tx type that includes identity, for now.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
+    default:
+      av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
+  }
 }
 
 static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
-                        txfm_param->bd);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  switch (tx_type) {
+      // Use the C version for any tx type that includes identity, for now.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
+    default:
+      av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
+  }
 }
 
 static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
index 6155b255a..daabc7119 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_
-#define AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
 
 #include "config/aom_config.h"
 
@@ -28,4 +28,4 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#endif  // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
index 3897c2a6a..e55224cf7 100644
--- a/third_party/aom/av1/encoder/lookahead.h
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_LOOKAHEAD_H_
-#define AV1_ENCODER_LOOKAHEAD_H_
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
 
 #include "aom_scale/yv12config.h"
 #include "aom/aom_integer.h"
@@ -103,4 +103,4 @@ unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_LOOKAHEAD_H_
+#endif  // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h
index 23243dd9e..64f936176 100644
--- a/third_party/aom/av1/encoder/mathutils.h
+++ b/third_party/aom/av1/encoder/mathutils.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
 #include <memory.h>
 #include <math.h>
 #include <stdio.h>
@@ -23,7 +26,7 @@ static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
   double c;
   // Forward elimination
   for (k = 0; k < n - 1; k++) {
-    // Bring the largest magitude to the diagonal position
+    // Bring the largest magnitude to the diagonal position
     for (i = n - 1; i > k; i--) {
       if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
         for (j = 0; j < n; j++) {
@@ -352,3 +355,5 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
 
   return 0;
 }
+
+#endif  // AOM_AV1_ENCODER_MATHUTILS_H_
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
index 472173634..1a35ff77c 100644
--- a/third_party/aom/av1/encoder/mbgraph.c
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -17,11 +17,12 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/system_state.h"
-#include "av1/encoder/segmentation.h"
-#include "av1/encoder/mcomp.h"
 #include "av1/common/blockd.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
 
 static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
                                               int mb_row, int mb_col) {
@@ -140,7 +141,7 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
 
   // calculate SATD for each intra prediction mode;
   // we're intentionally not doing 4x4, we just want a rough estimate
-  for (mode = DC_PRED; mode <= PAETH_PRED; mode++) {
+  for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) {
     unsigned int err;
 
     xd->mi[0]->mode = mode;
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
index 3e0a4fa9b..ba08476f7 100644
--- a/third_party/aom/av1/encoder/mbgraph.h
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_MBGRAPH_H_
-#define AV1_ENCODER_MBGRAPH_H_
+#ifndef AOM_AV1_ENCODER_MBGRAPH_H_
+#define AOM_AV1_ENCODER_MBGRAPH_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,4 +38,4 @@ void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_MBGRAPH_H_
+#endif  // AOM_AV1_ENCODER_MBGRAPH_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
index c4572a341..8f6de9b53 100644
--- a/third_party/aom/av1/encoder/mcomp.c
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -29,6 +29,7 @@
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
 
 // #define NEW_DIAMOND_SEARCH
 
@@ -219,7 +220,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
     thismse = upsampled_pref_error(                                        \
         xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,    \
         pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
-        mask_stride, invert_mask, w, h, &sse);                             \
+        mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
     v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);     \
     v += thismse;                                                          \
     if (v < besterr) {                                                     \
@@ -342,19 +343,19 @@ static unsigned int setup_center_error(
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
       if (mask) {
-        aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset,
+        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset,
                                   y_stride, mask, mask_stride, invert_mask);
       } else {
         if (xd->jcp_param.use_jnt_comp_avg)
-          aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h,
-                                       y + offset, y_stride, &xd->jcp_param);
+          aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+                                       y_stride, &xd->jcp_param);
         else
-          aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+          aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
                                    y_stride);
       }
-      besterr =
-          vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
+      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     } else {
       DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
       if (mask) {
@@ -648,51 +649,54 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *second_pred, const uint8_t *mask,
                                 int mask_stride, int invert_mask, int w, int h,
-                                unsigned int *sse) {
+                                unsigned int *sse, int subpel_search) {
   unsigned int besterr;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
     if (second_pred != NULL) {
       if (mask) {
         aom_highbd_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3,
-            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd);
+            xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
+            subpel_search);
       } else {
         if (xd->jcp_param.use_jnt_comp_avg)
           aom_highbd_jnt_comp_avg_upsampled_pred(
-              xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h,
-              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param);
+              xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+              subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search);
         else
-          aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16,
-                                             second_pred, w, h, subpel_x_q3,
-                                             subpel_y_q3, y, y_stride, xd->bd);
+          aom_highbd_comp_avg_upsampled_pred(
+              xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+              subpel_y_q3, y, y_stride, xd->bd, subpel_search);
       }
     } else {
-      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
-                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                                subpel_search);
     }
-
-    besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
+    besterr = vfp->vf(pred8, w, src, src_stride, sse);
   } else {
     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
     if (second_pred != NULL) {
       if (mask) {
-        aom_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
-            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask);
+        aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+                                     second_pred, w, h, subpel_x_q3,
+                                     subpel_y_q3, y, y_stride, mask,
+                                     mask_stride, invert_mask, subpel_search);
       } else {
         if (xd->jcp_param.use_jnt_comp_avg)
           aom_jnt_comp_avg_upsampled_pred(
               xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
-              subpel_y_q3, y, y_stride, &xd->jcp_param);
+              subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search);
         else
           aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
                                       second_pred, w, h, subpel_x_q3,
-                                      subpel_y_q3, y, y_stride);
+                                      subpel_y_q3, y, y_stride, subpel_search);
       }
     } else {
       aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                         subpel_y_q3, y, y_stride);
+                         subpel_y_q3, y, y_stride, subpel_search);
     }
 
     besterr = vfp->vf(pred, w, src, src_stride, sse);
@@ -707,10 +711,11 @@ static unsigned int upsampled_setup_center_error(
     const int src_stride, const uint8_t *const y, int y_stride,
     const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
     int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
-    unsigned int *sse1, int *distortion) {
-  unsigned int besterr = upsampled_pref_error(
-      xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset,
-      y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1);
+    unsigned int *sse1, int *distortion, int subpel_search) {
+  unsigned int besterr =
+      upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride,
+                           y + offset, y_stride, 0, 0, second_pred, mask,
+                           mask_stride, invert_mask, w, h, sse1, subpel_search);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
@@ -781,7 +786,8 @@ int av1_find_best_sub_pixel_tree(
     besterr = upsampled_setup_center_error(
         xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
         src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
-        h, offset, mvjcost, mvcost, sse1, distortion);
+        h, offset, mvjcost, mvcost, sse1, distortion,
+        use_accurate_subpel_search);
   else
     besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                  src_address, src_stride, y, y_stride,
@@ -802,7 +808,8 @@ int av1_find_best_sub_pixel_tree(
           thismse = upsampled_pref_error(
               xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
               pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
-              mask, mask_stride, invert_mask, w, h, &sse);
+              mask, mask_stride, invert_mask, w, h, &sse,
+              use_accurate_subpel_search);
         } else {
           thismse = estimate_upsampled_pref_error(
               xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -837,7 +844,8 @@ int av1_find_best_sub_pixel_tree(
         thismse = upsampled_pref_error(
             xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
             pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
-            mask, mask_stride, invert_mask, w, h, &sse);
+            mask, mask_stride, invert_mask, w, h, &sse,
+            use_accurate_subpel_search);
       } else {
         thismse = estimate_upsampled_pref_error(
             xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -929,8 +937,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
   int16_t bc = mbmi->mv[0].as_mv.col;
   int16_t *tr = &mbmi->mv[0].as_mv.row;
   int16_t *tc = &mbmi->mv[0].as_mv.col;
-  WarpedMotionParams best_wm_params = mbmi->wm_params[0];
-  int best_num_proj_ref = mbmi->num_proj_ref[0];
+  WarpedMotionParams best_wm_params = mbmi->wm_params;
+  int best_num_proj_ref = mbmi->num_proj_ref;
   unsigned int bestmse;
   int minc, maxc, minr, maxr;
   const int start = cm->allow_high_precision_mv ? 0 : 4;
@@ -962,18 +970,18 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
         memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
         memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
         if (total_samples > 1)
-          mbmi->num_proj_ref[0] =
+          mbmi->num_proj_ref =
               selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
 
-        if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr,
-                             *tc, &mbmi->wm_params[0], mi_row, mi_col)) {
+        if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
+                             *tc, &mbmi->wm_params, mi_row, mi_col)) {
           thismse =
               av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv);
 
           if (thismse < bestmse) {
             best_idx = idx;
-            best_wm_params = mbmi->wm_params[0];
-            best_num_proj_ref = mbmi->num_proj_ref[0];
+            best_wm_params = mbmi->wm_params;
+            best_num_proj_ref = mbmi->num_proj_ref;
             bestmse = thismse;
           }
         }
@@ -990,8 +998,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
 
   *tr = br;
   *tc = bc;
-  mbmi->wm_params[0] = best_wm_params;
-  mbmi->num_proj_ref[0] = best_num_proj_ref;
+  mbmi->wm_params = best_wm_params;
+  mbmi->num_proj_ref = best_num_proj_ref;
   return bestmse;
 }
 
@@ -2013,8 +2021,16 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
                              const uint8_t *mask, int mask_stride,
                              int invert_mask, const MV *center_mv,
                              const uint8_t *second_pred) {
-  const MV neighbors[8] = { { -1, 0 },  { 0, -1 }, { 0, 1 },  { 1, 0 },
-                            { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
+  static const search_neighbors neighbors[8] = {
+    { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+  };
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
@@ -2022,6 +2038,10 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
   MV *best_mv = &x->best_mv.as_mv;
   unsigned int best_sad = INT_MAX;
   int i, j;
+  uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] =
+      { 0 };
+  int grid_center = SEARCH_GRID_CENTER_8P;
+  int grid_coord = grid_center;
 
   clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
            x->mv_limits.row_min, x->mv_limits.row_max);
@@ -2043,13 +2063,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
                  mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
   }
 
+  do_refine_search_grid[grid_coord] = 1;
+
   for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
     for (j = 0; j < 8; ++j) {
-      const MV mv = { best_mv->row + neighbors[j].row,
-                      best_mv->col + neighbors[j].col };
+      grid_coord = grid_center + neighbors[j].coord_offset;
+      if (do_refine_search_grid[grid_coord] == 1) {
+        continue;
+      }
+      const MV mv = { best_mv->row + neighbors[j].coord.row,
+                      best_mv->col + neighbors[j].coord.col };
 
+      do_refine_search_grid[grid_coord] = 1;
       if (is_mv_in(&x->mv_limits, &mv)) {
         unsigned int sad;
         if (mask) {
@@ -2079,8 +2106,9 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
     if (best_site == -1) {
       break;
     } else {
-      best_mv->row += neighbors[best_site].row;
-      best_mv->col += neighbors[best_site].col;
+      best_mv->row += neighbors[best_site].coord.row;
+      best_mv->col += neighbors[best_site].coord.col;
+      grid_center += neighbors[best_site].coord_offset;
     }
   }
   return best_sad;
@@ -2099,11 +2127,11 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
 }
 
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                          MV *mvp_full, int step_param, int error_per_bit,
+                          MV *mvp_full, int step_param, int method,
+                          int run_mesh_search, int error_per_bit,
                           int *cost_list, const MV *ref_mv, int var_max, int rd,
                           int x_pos, int y_pos, int intra) {
   const SPEED_FEATURES *const sf = &cpi->sf;
-  const SEARCH_METHODS method = sf->mv.search_method;
   const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
 
@@ -2168,11 +2196,35 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     default: assert(0 && "Invalid search method.");
   }
 
+  // Should we allow a follow on exhaustive search?
+  if (!run_mesh_search) {
+    if (method == NSTEP) {
+      if (is_exhaustive_allowed(cpi, x)) {
+        int exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>=
+            10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) run_mesh_search = 1;
+      }
+    }
+  }
+
+  if (run_mesh_search) {
+    int var_ex;
+    MV tmp_mv_ex;
+    var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+                                   cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+    if (var_ex < var) {
+      var = var_ex;
+      x->best_mv.as_mv = tmp_mv_ex;
+    }
+  }
+
   if (method != NSTEP && rd && var < var_max)
     var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
 
   do {
-    if (!av1_use_hash_me(&cpi->common)) break;
+    if (!intra || !av1_use_hash_me(&cpi->common)) break;
 
     // already single ME
     // get block size and original buffer of current block
@@ -2195,7 +2247,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
         av1_get_block_hash_value(
             what, what_stride, block_width, &hash_value1, &hash_value2,
-            x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+            x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
 
         const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
         // for intra, at lest one matching can be found, itself.
@@ -2279,7 +2331,8 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     MV this_mv = { r, c };                                                    \
     thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv,     \
                                         mask, vfp, z, pre(y, y_stride, r, c), \
-                                        y_stride, sp(c), sp(r), w, h, &sse);  \
+                                        y_stride, sp(c), sp(r), w, h, &sse,   \
+                                        use_accurate_subpel_search);          \
     if ((v = MVC(r, c) + thismse) < besterr) {                                \
       besterr = v;                                                            \
       br = r;                                                                 \
@@ -2307,18 +2360,20 @@ static int upsampled_obmc_pref_error(
     MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
     const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
     const int32_t *const wsrc, const uint8_t *const y, int y_stride,
-    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) {
+    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+    int subpel_search) {
   unsigned int besterr;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
-                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
 
-    besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                              subpel_search);
+    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
   } else {
-    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
     aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                       subpel_y_q3, y, y_stride);
+                       subpel_y_q3, y, y_stride, subpel_search);
 
     besterr = vfp->ovf(pred, w, wsrc, mask, sse);
   }
@@ -2330,10 +2385,11 @@ static unsigned int upsampled_setup_obmc_center_error(
     const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
     const uint8_t *const y, int y_stride, int w, int h, int offset,
-    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
-  unsigned int besterr =
-      upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc,
-                                y + offset, y_stride, 0, 0, w, h, sse1);
+    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
+    int subpel_search) {
+  unsigned int besterr = upsampled_obmc_pref_error(
+      xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
+      0, w, h, sse1, subpel_search);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
@@ -2388,11 +2444,12 @@ int av1_find_best_obmc_sub_pixel_tree_up(
 
   bestmv->row *= 8;
   bestmv->col *= 8;
-  // use_accurate_subpel_search can be 0 or 1
+  // use_accurate_subpel_search can be 0 or 1 or 2
   if (use_accurate_subpel_search)
     besterr = upsampled_setup_obmc_center_error(
         xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
-        y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion);
+        y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
+        use_accurate_subpel_search);
   else
     besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
                                       z, y, y_stride, offset, mvjcost, mvcost,
@@ -2408,7 +2465,8 @@ int av1_find_best_obmc_sub_pixel_tree_up(
         if (use_accurate_subpel_search) {
           thismse = upsampled_obmc_pref_error(
               xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
-              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+              use_accurate_subpel_search);
         } else {
           thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
                               sp(tr), src_address, mask, &sse);
@@ -2439,7 +2497,8 @@ int av1_find_best_obmc_sub_pixel_tree_up(
       if (use_accurate_subpel_search) {
         thismse = upsampled_obmc_pref_error(
             xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
-            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+            use_accurate_subpel_search);
       } else {
         thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
                             src_address, mask, &sse);
@@ -2643,11 +2702,12 @@ int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
   return best_sad;
 }
 
-int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
-                                MV *mvp_full, int step_param, int sadpb,
-                                int further_steps, int do_refine,
-                                const aom_variance_fn_ptr_t *fn_ptr,
-                                const MV *ref_mv, MV *dst_mv, int is_second) {
+static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   MV *mvp_full, int step_param, int sadpb,
+                                   int further_steps, int do_refine,
+                                   const aom_variance_fn_ptr_t *fn_ptr,
+                                   const MV *ref_mv, MV *dst_mv,
+                                   int is_second) {
   const int32_t *wsrc = x->wsrc_buf;
   const int32_t *mask = x->mask_buf;
   MV temp_mv;
@@ -2704,6 +2764,31 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
   return bestsme;
 }
 
+int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
+                               int step_param, int sadpb, int further_steps,
+                               int do_refine,
+                               const aom_variance_fn_ptr_t *fn_ptr,
+                               const MV *ref_mv, MV *dst_mv, int is_second) {
+  if (cpi->sf.obmc_full_pixel_search_level == 0) {  // level 0: diamond search
+    return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
+                                   further_steps, do_refine, fn_ptr, ref_mv,
+                                   dst_mv, is_second);
+  } else {  // any other level: step_param/further_steps/do_refine are unused
+    const int32_t *wsrc = x->wsrc_buf;
+    const int32_t *mask = x->mask_buf;
+    const int search_range = 8;  // fixed range handed to the refining search
+    *dst_mv = *mvp_full;  // start from the caller's candidate MV
+    clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+             x->mv_limits.row_min, x->mv_limits.row_max);  // stay in mv_limits
+    int thissme = obmc_refining_search_sad(  // presumably updates dst_mv: confirm
+        x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second);
+    if (thissme < INT_MAX)  // INT_MAX is returned as-is; else re-score dst_mv
+      thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1,
+                                    is_second);
+    return thissme;
+  }
+}
+
 // Note(yunqingwang): The following 2 functions are only used in the motion
 // vector unit test, which return extreme motion vectors allowed by the MV
 // limits.
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
index 539e8f4e4..a975218b0 100644
--- a/third_party/aom/av1/encoder/mcomp.h
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_MCOMP_H_
-#define AV1_ENCODER_MCOMP_H_
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
 
 #include "av1/encoder/block.h"
 #include "aom_dsp/variance.h"
@@ -31,6 +31,11 @@ extern "C" {
 // for Block_16x16
 #define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
 
+#define SEARCH_RANGE_8P 3  // largest offset (per axis) covered by the grid
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)  // cells in -R..+R
+#define SEARCH_GRID_CENTER_8P \
+  (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)  // cell (R, R)
+
 // motion search site
 typedef struct search_site {
   MV mv;
@@ -43,6 +48,11 @@ typedef struct search_site_config {
   int searches_per_step;
 } search_site_config;
 
+typedef struct {
+  MV coord;          // row/col step added to the current best MV
+  int coord_offset;  // same step as an index delta into the visited grid
+} search_neighbors;
+
 void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
 void av1_init3smotion_compensation(search_site_config *cfg, int stride);
 
@@ -120,14 +130,15 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
 
 int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full, int step_param,
-                          int error_per_bit, int *cost_list, const MV *ref_mv,
-                          int var_max, int rd, int x_pos, int y_pos, int intra);
-
-int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                                MV *mvp_full, int step_param, int sadpb,
-                                int further_steps, int do_refine,
-                                const aom_variance_fn_ptr_t *fn_ptr,
-                                const MV *ref_mv, MV *dst_mv, int is_second);
+                          int method, int run_mesh_search, int error_per_bit,
+                          int *cost_list, const MV *ref_mv, int var_max, int rd,
+                          int x_pos, int y_pos, int intra);
+
+int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                               MV *mvp_full, int step_param, int sadpb,
+                               int further_steps, int do_refine,
+                               const aom_variance_fn_ptr_t *fn_ptr,
+                               const MV *ref_mv, MV *dst_mv, int is_second);
 int av1_find_best_obmc_sub_pixel_tree_up(
     MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
     MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
@@ -147,4 +158,4 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_MCOMP_H_
+#endif  // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
index 3a27e5845..d21def43a 100644
--- a/third_party/aom/av1/encoder/ml.c
+++ b/third_party/aom/av1/encoder/ml.c
@@ -10,7 +10,9 @@
  */
 
 #include <assert.h>
+#include <math.h>
 
+#include "aom_dsp/aom_dsp_common.h"
 #include "av1/encoder/ml.h"
 
 void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
@@ -55,3 +57,17 @@ void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
     weights += num_input_nodes;
   }
 }
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+  // Softmax is invariant to adding the same constant to every input, so the
+  // maximum input is subtracted first: every exponent is then <= 0, which
+  // keeps exp() from overflowing.
+  float max_inp = input[0];  // read unconditionally, so n must be >= 1
+  for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+  float sum_out = 0.0f;  // the max element adds exp(0) = 1 for finite inputs
+  for (int i = 0; i < n; i++) {
+    output[i] = (float)exp(input[i] - max_inp);  // unnormalized weight
+    sum_out += output[i];
+  }
+  for (int i = 0; i < n; i++) output[i] /= sum_out;  // scale to sum to 1
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
index 614cb60bb..cb8ef2871 100644
--- a/third_party/aom/av1/encoder/ml.h
+++ b/third_party/aom/av1/encoder/ml.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_ML_H_
-#define AV1_ENCODER_ML_H_
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,8 +37,13 @@ typedef struct {
 void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
                     float *output);
 
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RD_H_
+#endif  // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
index bbdd50784..8b88c4755 100644
--- a/third_party/aom/av1/encoder/palette.h
+++ b/third_party/aom/av1/encoder/palette.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_PALETTE_H_
-#define AV1_ENCODER_PALETTE_H_
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
 
 #include "av1/common/blockd.h"
 
@@ -93,4 +93,4 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
 }  // extern "C"
 #endif
 
-#endif /* AV1_ENCODER_PALETTE_H_ */
+#endif  // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
index 279d39495..437ea43f9 100644
--- a/third_party/aom/av1/encoder/partition_model_weights.h
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
-#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -1314,204 +1314,112 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
 #define FEATURE_SIZE 18
 #define LABEL_SIZE 4
 
-static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = {
-  0.121894f,  0.058485f,  0.702226f,  0.015457f,  -0.123380f, -0.573450f,
-  0.319576f,  0.118808f,  0.166057f,  0.526984f,  0.015211f,  -0.025050f,
-  0.085717f,  -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f,
-  0.083573f,  0.007112f,  -0.358623f, -0.264443f, -0.064819f, 0.022013f,
-  -0.040077f, -0.291967f, -0.293100f, 0.072266f,  -0.270572f, -0.292253f,
-  -0.260105f, -0.294472f, -0.275752f, 0.054315f,  0.000085f,  0.105115f,
-  -0.363572f, -0.016542f, 0.185943f,  -0.359903f, 0.038765f,  -0.377668f,
-  0.172692f,  0.127749f,  -0.031275f, -0.242528f, -0.145880f, -0.055247f,
-  -0.000265f, -0.355224f, 0.089917f,  -0.377841f, -0.209766f, 0.030899f,
-  0.039546f,  -0.375030f, -0.041605f, 0.137677f,  0.021282f,  -0.150442f,
-  -0.189445f, 0.009293f,  -0.316033f, 0.038745f,  -0.278761f, 0.005692f,
-  -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f,  0.005435f,
-  -0.930979f, 0.115513f,  0.689958f,  0.221318f,  1.003891f,  0.359540f,
-  -0.640534f, -0.162373f, -0.118105f, 0.205587f,  0.019710f,  0.025067f,
-  -0.025344f, 0.002831f,  0.033078f,  0.040175f,  -0.007502f, 0.026272f,
-  0.083443f,  -0.880884f, 0.436948f,  0.293297f,  0.051678f,  -0.133328f,
-  -0.180323f, 0.667835f,  0.070733f,  -0.003060f, -0.221804f, 0.146601f,
-  0.064024f,  0.056758f,  -0.077361f, 0.105587f,  -0.185500f, -0.133552f,
-  0.138269f,  0.165055f,  0.628284f,  0.846449f,  0.058825f,  0.223157f,
-  0.277896f,  -0.381303f, 0.408241f,  0.643301f,  0.067494f,  0.120822f,
-  -0.182491f, -0.111373f, -0.033374f, 0.131387f,  -0.114654f, 0.114318f,
-  0.094718f,  -0.052232f, 0.385903f,  1.212304f,  0.425305f,  -0.052993f,
-  0.291474f,  -0.319730f, 0.023090f,  -0.317259f, 0.011181f,  -0.034185f,
-  -0.100671f, 0.186185f,  -0.432511f, -0.115957f, -0.067746f, -0.177810f,
-  -0.226700f, 0.004464f,  0.006809f,  0.171360f,  -0.080723f, 0.099826f,
-  -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f,
-  0.010452f,  -0.341089f, -0.013566f, -0.340129f, 0.034675f,  -0.036518f,
-  -0.036473f, -0.192892f, 0.650235f,  0.609437f,  -0.160982f, 0.125535f,
-  -1.004575f, 0.521969f,  1.318091f,  0.614004f,  -0.106622f, -0.077453f,
-  -0.037328f, -0.081940f, 0.007640f,  0.026654f,  -0.080332f, -0.077356f,
-  -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f,  0.089502f,
-  -0.280502f, 0.003941f,  -0.249937f, 0.244263f,  0.023269f,  0.080263f,
-  0.073172f,  -0.200036f, 0.022381f,  0.008592f,  -0.339517f, -0.135073f,
-  0.177199f,  0.208363f,  0.652360f,  0.272990f,  0.609535f,  0.145805f,
-  0.022527f,  -0.088378f, 0.205008f,  0.101021f,  -0.019673f, -0.252681f,
-  0.116034f,  -0.062052f, 0.009991f,  0.138933f,  -0.182428f, 0.052542f,
-  -0.350825f, -0.122654f, -0.154687f, 0.066747f,  0.021541f,  -0.212169f,
-  -0.087093f, -0.087488f, 0.178129f,  -0.146544f, 0.013919f,  -0.273899f,
-  0.223753f,  -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f,
-  -0.135236f, 0.058918f,  0.069080f,  0.279287f,  0.369689f,  1.134526f,
-  0.659511f,  0.250223f,  0.286040f,  0.515284f,  0.067791f,  -0.156385f,
-  0.143283f,  0.050884f,  0.089956f,  -0.040850f, -0.003650f, -0.081162f,
-  0.086004f,  0.116578f,  0.826254f,  0.504869f,  -0.196022f, -0.207279f,
-  0.200503f,  -0.196801f, 0.008211f,  0.411158f,  -0.075855f, -0.036690f,
-  0.111519f,  -0.057838f, -0.005846f, 0.111067f,  0.174712f,  -0.078054f,
-  0.765897f,  0.018670f,  -0.306960f, -0.020034f, -0.332875f, 0.662707f,
-  -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f,
-  -0.009141f, 0.003325f,  -0.011233f, -0.000819f, 0.006369f,  0.002418f,
-  -0.035906f, -0.005135f, 1.073830f,  1.020736f,  -0.182611f, -1.038976f,
-  -0.226695f, -0.375663f, 0.364568f,  0.620995f,  -0.018615f, 0.011347f,
-  0.045786f,  0.041077f,  0.010886f,  -0.148428f, 0.028007f,  -0.022322f,
-  -0.165985f, 0.233315f,  -0.277531f, -0.329683f, -0.516967f, -0.390750f,
-  0.006948f,  0.133744f,  -0.375681f, -0.116877f, -0.009441f, -0.008597f,
-  -0.160679f, 0.102150f,  -0.142647f, -0.117501f, 0.035035f,  0.228687f,
-  -1.117397f, -0.005171f, -0.008708f, 0.413042f,  -0.298532f, 0.614909f,
-  -0.181084f, -0.711770f, 0.344033f,  0.287220f,  -0.112848f, -0.052866f,
-  -0.222466f, 0.025029f,  -0.107558f, 0.137036f,  -0.276661f, -0.038808f,
-  -0.057448f, 0.037563f,  0.526020f,  0.447997f,  0.288366f,  0.264815f,
-  0.319974f,  -0.193091f, 0.353830f,  0.412950f,  -0.280454f, 0.092737f,
-  0.070919f,  0.043336f,  0.041214f,  -0.052147f, 0.010860f,  0.191325f,
-  0.079783f,  -0.425672f, -0.053469f, -0.005495f, 0.184526f,  -0.166171f,
-  0.084459f,  -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f,
-  -0.189614f, -0.054146f, -0.261279f, 0.196347f,  -0.087568f, 0.070533f,
-  -0.145492f, -0.041500f, -0.465861f, 0.077369f,  0.020645f,  -0.440232f,
-  -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f,
-  0.085513f,  -0.200425f, 0.218516f,  0.049604f,  -0.280952f, -0.242674f,
-  -1.969931f, 0.013374f,  -0.039643f, 1.113947f,  0.018568f,  0.916330f,
-  -0.302934f, -0.225816f, 0.189529f,  -0.361971f, 0.021073f,  -0.050143f,
-  -0.041415f, 0.015126f,  0.018091f,  -0.082401f, 0.017152f,  0.064856f,
-  0.156170f,  0.145323f,  -0.281409f, 0.213357f,  -0.058966f, 0.158668f,
-  0.033742f,  0.378820f,  -0.662875f, -0.455532f, -0.702928f, 0.234325f,
-  0.139627f,  -1.360650f, 0.040921f,  -0.044373f, -0.059999f, -0.048565f,
-  0.115339f,  -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f,
-  -0.356286f, -0.374928f, 0.143257f,  -0.317790f, 0.079875f,  -0.359345f,
-  0.081321f,  -0.219772f, -0.077213f, 0.110624f,  -0.252329f, -0.266481f,
-  0.190135f,  0.121214f,  0.661064f,  -0.037820f, -0.373068f, -0.065209f,
-  -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f,
-  -0.032010f, 0.110627f,  0.054094f,  -0.884309f, -1.171623f, -0.386911f,
-  -0.756058f, 0.030362f,  0.563628f,  -0.334227f, -0.111213f, 1.143898f,
-  -0.940454f, 0.084510f,  0.671010f,  0.312244f,  -0.052592f, -0.014376f,
-  0.039965f,  -0.010763f, -0.114936f, -0.146020f, 0.015874f,  0.027439f,
-  -1.702315f, 0.148702f,  0.153021f,  0.363147f,  -0.488933f, 0.220772f,
-  0.640310f,  -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f,
-  0.061041f,  -0.013998f, 0.086539f,  0.000466f,  0.037472f,  -0.010665f,
-  -0.326646f, 0.106971f,  0.405589f,  0.555345f,  -0.318315f, 0.526498f,
-  0.119246f,  0.022213f,  0.171237f,  0.214651f,  0.062904f,  -0.023764f,
-  0.011831f,  0.079644f,  -0.096530f, -0.054373f, -0.306309f, -0.203709f,
-  -0.353217f, -0.350005f, -0.329549f, 0.062679f,  -0.387625f, -0.237111f,
-  -0.025050f, -0.193987f, 0.002235f,  -0.380821f, -0.051036f, -0.136020f,
-  0.077989f,  -0.361691f, 0.120485f,  0.157746f,  0.073394f,  -0.284401f,
-  0.113221f,  0.109808f,  0.000197f,  0.122523f,  0.081411f,  -0.048544f,
-  -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f,  -1.392915f,
-  -0.865248f, 0.114577f,  -0.000749f, -0.060338f, -0.091176f, -0.108421f,
-  0.221256f,  0.100176f,  -0.877560f, -1.248838f, 0.643005f,  0.064580f,
-  -0.049878f, 0.267988f,  -0.434340f, -0.299254f, -0.097572f, 0.009606f,
-  0.063810f,  -0.090525f, 0.027760f,  0.043484f,  0.041697f,  0.108024f,
-  -0.359586f, -0.197090f, 0.121397f,  0.152206f,  -0.391126f, -0.283145f,
-  0.008754f,  -0.059022f, -0.218745f, 0.043042f,  -0.056716f, 0.153051f,
-  -0.210372f, -0.029681f, -0.288354f, 0.065242f,  -0.189376f, 0.115013f,
-  -0.251488f, -0.533091f, 0.037768f,  -0.319107f, -0.161364f, -0.103967f,
-  0.063271f,  -0.313289f, -0.312093f, -0.045239f, 0.150607f,  0.001487f,
-  0.019602f,  -0.338031f, -0.036214f, 0.112736f,  -0.367762f, 0.122367f,
-  0.094670f,  0.175590f,  0.301041f,  -0.135257f, 0.539620f,  0.328619f,
-  -0.163971f, 0.137256f,  0.238805f,  0.483722f,  0.121353f,  0.083630f,
-  -0.283568f, 0.291661f,  -0.061122f, -0.195295f, 0.153459f,  -0.153727f,
-  -0.238839f, -0.071736f, 0.601437f,  -0.664072f, 0.230827f,  0.198753f,
-  -0.039196f, 0.206751f,  0.529020f,  0.904132f,  -0.219471f, 0.186694f,
-  -0.208608f, -0.093385f, -0.161617f, 0.003930f,  -0.429869f, -0.123563f,
-  0.626098f,  -0.002495f, -0.245511f, -1.069848f, 0.296115f,  -0.940267f,
-  -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f,
-  -0.003030f, 0.035986f,  -0.004812f, -0.009193f, -0.004644f, -0.024347f,
-  0.068439f,  -0.314339f, 0.095057f,  -0.212372f, 0.197523f,  -0.040878f,
-  -0.272164f, -0.243326f, -0.204955f, 0.157199f,  -0.049964f, -0.091537f,
-  -0.058012f, -0.306650f, 0.098621f,  -0.146778f, -0.154447f, -0.177889f,
-  -0.009698f, 0.025427f,  0.350576f,  -0.448237f, -0.068823f, 1.224960f,
-  -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f,  -0.056460f,
-  0.021654f,  0.004352f,  0.041508f,  -0.027179f, 0.006789f,  -0.023573f,
-  0.207775f,  -0.280273f, -0.347984f, -0.129935f, 0.151512f,  -0.087294f,
-  -0.494352f, -0.341424f, 0.044084f,  -0.064080f, 0.073091f,  -0.145574f,
-  0.094715f,  -0.258786f, -0.020419f, -0.401823f, 0.009397f,  -0.138642f,
-  -0.034953f, -0.077419f, 0.636610f,  0.314980f,  1.110610f,  -0.343368f,
-  0.696647f,  -0.649667f, 0.653491f,  -0.096006f, -0.090469f, -0.066975f,
-  -0.105864f, -0.015666f, 0.102056f,  -0.105344f, -0.273495f, -0.014686f,
-  0.122031f,  0.139524f,  -1.042029f, -0.562510f, 0.885644f,  1.088059f,
-  0.189223f,  0.049404f,  -0.167371f, 0.018703f,  -0.208390f, -0.159002f,
-  -0.377130f, -0.151118f, 0.117861f,  0.026986f,  -0.032433f, 0.081603f,
-  -0.106729f, -0.040134f, 0.015161f,  0.290572f,  0.241446f,  1.390085f,
-  0.438915f,  -0.358097f, -0.171799f, 0.879758f,  -0.014110f, 0.029562f,
-  -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f,  0.120979f,
-  0.064538f,  -0.038841f, 0.034797f,  0.110229f,  -0.239779f, -0.004558f,
-  0.226534f,  0.111286f,  -0.268198f, 0.237673f,  -0.328237f, -0.090774f,
-  -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f,  -0.169217f,
-  -0.300125f, 0.069031f,  -0.081358f, -0.376174f, -0.349980f, 0.071443f,
-  -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f,
-  0.079847f,  -0.214580f, -0.235661f, -0.184227f, 0.111099f,  -0.083945f,
-  -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f,
-  -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f,  0.081168f,
-  -0.120831f, -0.016048f, -0.232495f, 0.214329f,  -0.055058f, 0.032856f,
-  0.061753f,  0.003226f,  0.097028f,  0.084535f,  -1.563199f, 0.434928f,
-  -0.403710f, 0.520696f,  -0.401696f, 0.450568f,  -0.074121f, 0.076622f,
-  -0.098421f, 0.167036f,  -0.255250f, -0.526313f, -0.933693f, -0.558104f,
-  0.194341f,  0.173326f,  0.071112f,  -0.651961f, -1.327587f, -0.705289f,
-  -1.138889f, 0.197167f,  -0.714654f, -0.113891f, 0.080158f,  0.000301f,
-  0.057905f,  0.060718f,  -0.635995f, 0.100026f,  -0.038239f, -0.025530f,
-};
-
-static const float av1_4_partition_nn_bias_16_layer0[48] = {
-  -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f,  0.215649f,
-  -0.337661f, -0.242379f, -0.053829f, 0.165168f,  -0.076613f, -0.190579f,
-  -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f,
-  -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f,  -0.503954f,
-  -0.083563f, 0.123344f,  0.011821f,  0.085445f,  -0.050294f, -0.135194f,
-  0.057815f,  0.543558f,  -0.090602f, -0.104671f, -0.285075f, 0.354335f,
-  1.037007f,  -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f,
-  -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f,
-};
-
-static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = {
-  0.174954f,  -0.239117f, 0.073252f,  0.258881f,  0.579781f,  0.441827f,
-  0.372037f,  -0.062362f, 0.068477f,  0.376811f,  -0.130520f, 0.214951f,
-  -0.200674f, 0.240347f,  0.152954f,  1.360264f,  0.334630f,  -0.064789f,
-  -0.270826f, 0.212699f,  0.045669f,  -0.150852f, -0.412603f, 0.122481f,
-  -0.230246f, 0.005004f,  0.321417f,  -0.554083f, -0.186742f, -0.197687f,
-  -0.028669f, -0.138559f, -0.117773f, 0.024953f,  0.326367f,  -0.109951f,
-  -1.098959f, -0.136134f, 0.563218f,  0.191799f,  0.126191f,  -0.093113f,
-  0.185371f,  0.058468f,  0.245247f,  -0.138064f, -0.471573f, -0.209372f,
-  -0.111171f, 0.222275f,  -0.350556f, -0.106336f, 0.268877f,  0.090639f,
-  -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f,
-  0.099751f,  0.353020f,  -0.199079f, -0.463492f, -0.647884f, 0.166611f,
-  -0.464034f, 0.045096f,  -0.312178f, -0.190972f, -0.468297f, 0.662376f,
-  -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f,
-  0.885444f,  0.350273f,  -0.003345f, 0.217260f,  0.219156f,  0.240653f,
-  0.347840f,  0.101849f,  -0.244565f, -0.166971f, 0.091056f,  0.319912f,
-  0.268459f,  0.250726f,  -0.155819f, -0.087588f, 0.010749f,  -0.192344f,
-  0.344808f,  0.223482f,  -0.189563f, -0.067317f, -0.348191f, -0.085265f,
-  0.259318f,  0.102408f,  0.096675f,  -0.255564f, -0.168480f, -0.068189f,
-  -0.457704f, 0.010565f,  0.228573f,  -0.124421f, 0.202488f,  0.148519f,
-  0.002180f,  0.099099f,  -0.179019f, 0.245414f,  -0.038307f, 0.116897f,
-  -0.031377f, 0.368533f,  -0.793891f, 0.148614f,  0.075441f,  0.102465f,
-  -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f,  -0.044980f,
-  0.092689f,  -0.181058f, 0.016279f,  0.155965f,  0.545361f,  -0.390699f,
-  -0.042457f, 0.110238f,  0.114640f,  0.112525f,  0.522221f,  0.533164f,
-  -0.331720f, -0.212966f, 0.140823f,  0.251311f,  -0.006092f, -0.800438f,
-  0.007981f,  -0.585140f, -0.006526f, 0.541683f,  -0.298498f, 0.084322f,
-  -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f,
-  0.667915f,  -0.176995f, 0.022307f,  -0.169493f, 0.581377f,  0.044929f,
-  0.044914f,  -0.056290f, 0.324196f,  0.648043f,  -0.089381f, -0.054971f,
-  0.064782f,  0.629356f,  -0.003760f, -0.123822f, 0.144133f,  -0.378821f,
-  1.116858f,  0.128552f,  -0.668783f, 0.207194f,  -0.437781f, -0.283321f,
-  -0.549404f, 0.010538f,  0.208997f,  0.231396f,  -0.174347f, 0.161910f,
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+  -2.032866f, 0.056691f,  0.495960f,  0.778785f,  0.548153f,  -0.806942f,
+  0.481155f,  0.282298f,  0.584980f,  0.504688f,  0.209648f,  0.234616f,
+  0.213484f,  0.221969f,  0.205862f,  0.235054f,  0.317863f,  0.257139f,
+  0.529478f,  0.098122f,  -0.657532f, 0.036296f,  0.327728f,  1.323180f,
+  -0.813082f, 0.160216f,  -0.702030f, 0.722733f,  -0.270576f, -0.347416f,
+  -0.264700f, -0.254248f, 0.159820f,  0.087995f,  -0.184163f, 0.117357f,
+  0.074194f,  -0.667369f, 0.498246f,  0.420506f,  0.072409f,  -0.121581f,
+  0.315788f,  0.000525f,  0.414986f,  0.678166f,  -0.011230f, 0.188131f,
+  -0.227749f, 0.009564f,  0.108672f,  0.106923f,  -0.080695f, -0.279382f,
+  -0.061339f, -0.297835f, -0.134707f, 0.145865f,  -0.009655f, -0.000842f,
+  -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+  -0.143413f, 0.279668f,  0.000885f,  -0.022380f, -0.140194f, -0.310473f,
+  0.252699f,  0.066204f,  0.477568f,  0.994609f,  -0.276000f, 1.213182f,
+  0.277028f,  -0.411570f, -0.211559f, 0.377815f,  0.121488f,  -0.100559f,
+  -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+  -0.143196f, -0.334035f, 0.162305f,  0.142279f,  -0.001210f, -0.135252f,
+  -0.033562f, 0.204307f,  -0.039757f, -0.394174f, 0.126617f,  -0.128648f,
+  -0.410979f, 0.107641f,  -0.117573f, -0.326512f, 0.235166f,  0.084959f,
+  0.290063f,  -0.005838f, 0.459894f,  1.023709f,  -0.196145f, 1.100137f,
+  -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+  -0.029743f, 0.125113f,  -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+  -0.101234f, 0.079706f,  -1.128615f, -0.467381f, 0.220563f,  -0.409900f,
+  -0.435353f, 0.759499f,  -0.465799f, -0.394309f, 0.176282f,  -0.086275f,
+  -0.161225f, -0.354814f, 0.562871f,  0.418253f,  0.414361f,  0.445480f,
+  -0.995903f, -0.086632f, -0.230645f, 0.354656f,  -0.317576f, 0.079926f,
+  0.424369f,  0.997232f,  -0.304388f, 1.071667f,  -0.023540f, 0.029677f,
+  0.108564f,  0.183581f,  -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+  -0.271949f, -0.358483f, 0.304930f,  0.023823f,  -0.009319f, -0.214247f,
+  0.100712f,  -0.050162f, 0.327103f,  -0.212999f, -0.030496f, 0.316380f,
+  -0.439589f, -0.249959f, 0.229777f,  -0.353664f, -0.384559f, 0.114236f,
+  0.023119f,  0.007927f,  0.618368f,  0.957759f,  -0.019780f, -1.002389f,
+  0.564277f,  -0.839531f, 1.040445f,  0.054340f,  0.031908f,  -0.032893f,
+  -0.019170f, -0.042011f, 0.568928f,  0.362567f,  -0.559999f, -0.605344f,
+  -0.586146f, -0.290778f, 0.195943f,  -0.109580f, -0.088898f, -0.113054f,
+  0.293282f,  0.429019f,  0.306136f,  0.863025f,  0.021234f,  0.125770f,
+  -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f,  0.064151f,
+  0.029883f,  0.076287f,  0.757543f,  0.276713f,  -2.529775f, -0.351727f,
+  -1.832316f, 0.544780f,  -0.944529f, 0.509705f,  -0.010236f, -0.016181f,
+  0.021520f,  0.086417f,  0.041312f,  0.296853f,  -0.372378f, 0.354446f,
+  -1.366762f, 0.048875f,  0.464918f,  -0.007450f, 0.750013f,  -0.360261f,
+  0.518532f,  0.753776f,  0.641448f,  0.710746f,  0.250866f,  0.257063f,
+  0.283421f,  0.253585f,  0.170303f,  0.210426f,  0.208842f,  0.158000f,
+  -0.033144f, 0.130748f,  0.907147f,  0.409248f,  -0.854301f, -0.981307f,
+  0.294427f,  -0.507137f, 1.079967f,  0.203203f,  0.383890f,  0.368278f,
+  0.305122f,  0.449288f,  -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+  0.007016f,  -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+  0.033502f,  -0.018578f, -0.231531f, 0.177949f,  0.099564f,  -0.010233f,
+  -0.333055f, -0.078586f, -0.417867f, 0.171271f,  0.013662f,  -0.143599f,
+  -0.117296f, 0.135382f,  0.048321f,  0.000924f,  -0.055024f, -0.405595f,
+  -0.068260f, -0.271011f, -0.436425f, 0.206751f,  -0.899890f, 0.605510f,
+  0.535649f,  -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+  0.004660f,  0.176644f,  -1.008475f, -0.038895f, 0.155429f,  -0.095229f,
+  -0.680124f, -0.258063f, -0.261901f, 0.110380f,  -0.337649f, -0.505870f,
+  -1.428536f, 0.610629f,  0.254905f,  0.045098f,  0.044109f,  0.172329f,
+  0.060001f,  -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+  -0.312134f, 0.081261f,  0.160166f,  0.112690f,  0.266081f,  0.030175f,
+  -0.242746f, 0.000754f,  -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+  -0.121466f, 0.067300f,  0.342176f,  0.474538f,  0.085441f,  -0.263935f,
+  0.479235f,  -0.003713f, -0.784840f, 0.119480f,  0.456632f,  -0.640082f,
+  -0.080575f, -0.744403f, 0.259970f,  0.034667f,  -0.274641f, -0.257594f,
+  -1.121124f, -0.003745f, -0.420693f, 0.300441f,  -0.100976f, -1.049016f,
+  0.201960f,  0.113054f,  0.187010f,  1.237427f,  0.054803f,  -0.028673f,
+  0.003596f,  -0.034724f, 0.117246f,  0.190977f,  0.278915f,  0.224307f,
+  0.017852f,  -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+  0.045698f,  -0.301095f, 0.184447f,  0.348240f,  -0.017021f, -0.145064f,
+  -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+  0.647597f,  -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+  -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+  -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+  -0.971412f, 0.038542f,  0.705204f,  0.887113f,  0.150430f,  -0.243676f,
+  0.638410f,  0.320953f,  0.776676f,  0.527584f,  0.070389f,  0.051554f,
+  0.177519f,  0.140451f,  0.128892f,  0.087771f,  0.197660f,  0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+  0.614063f,  -0.384872f, 0.084884f,  -0.023980f, -0.378765f, -0.082312f,
+  -0.458271f, 0.189578f,  -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+  0.148803f,  0.829214f,  -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+  -0.031905f, -0.143459f, 0.078823f,  -0.021940f, 0.026834f,  0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+  -0.985391f, 0.587616f,  0.740683f,  0.192066f,  0.447080f,  -0.016585f,
+  0.680449f,  0.028983f,  0.643111f,  0.234338f,  0.107148f,  0.328456f,
+  -0.216394f, 1.106838f,  -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+  -0.306017f, -0.350989f, 0.859284f,  -0.372831f, -0.954419f, 0.250495f,
+  1.046732f,  0.287923f,  -0.421088f, 0.326613f,  -0.314396f, -0.084757f,
+  -0.474228f, 0.687999f,  0.052334f,  0.441708f,  -0.630698f, -0.350348f,
+  -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f,  0.603119f,
+  0.120245f,  0.182920f,  -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+  0.489497f,  -0.527019f, 0.102453f,  0.426731f,  0.034606f,  0.311461f,
+  -0.012723f, -0.229877f, -0.284290f, 0.383227f,  0.065696f,  -0.222400f,
+  1.279248f,  -0.862190f, 0.629766f,  -0.250011f, -0.325060f, -0.360115f,
+  -0.159540f, -0.291856f, -0.038348f, 0.224639f,  0.600934f,  0.030205f,
+  1.337615f,  -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+  -0.481860f, 0.563625f,  -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+  -0.095253f, -0.711135f, 0.207759f,  0.076313f,  -0.056087f, -0.162719f,
+  -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f,  -1.504446f,
 };
 
 static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
-  -0.197883f,
-  -0.136696f,
-  0.094115f,
-  0.612799f,
+  -0.462133f,
+  0.465060f,
+  0.062211f,
+  0.401786f,
 };
 
 static const NN_CONFIG av1_4_partition_nnconfig_16 = {
@@ -1519,7 +1427,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = {
   LABEL_SIZE,    // num_outputs
   1,             // num_hidden_layers
   {
-      48,  // num_hidden_nodes
+      24,  // num_hidden_nodes
   },
   {
       av1_4_partition_nn_weights_16_layer0,
@@ -1532,143 +1440,143 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = {
 };
 
 static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
-  0.114554f,  0.043669f,  0.313291f,  0.167688f,  -0.413357f, 0.088232f,
-  0.301915f,  -0.358117f, 0.267711f,  -0.252716f, -0.038531f, -0.032805f,
-  -0.025382f, 0.023624f,  -0.949694f, -0.065480f, -0.375721f, -0.697319f,
-  -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f,
-  0.199717f,  0.216902f,  -0.239241f, -0.096894f, -0.225046f, 0.246523f,
-  0.002333f,  -0.254385f, -0.205815f, 0.123139f,  -0.476923f, 0.137557f,
-  0.059686f,  -0.124013f, 0.974675f,  0.889753f,  0.378940f,  0.526413f,
-  -0.208747f, -0.001913f, 0.094081f,  0.848010f,  0.062042f,  0.159831f,
-  0.071016f,  0.024437f,  0.212611f,  0.039501f,  -0.149922f, -0.055229f,
-  -0.229270f, 0.129004f,  -0.182803f, 0.291223f,  -1.197804f, -0.916991f,
-  -0.024095f, 0.738729f,  -0.300326f, 0.402480f,  0.023944f,  -0.022613f,
-  -0.004554f, 0.001784f,  0.035143f,  -0.202237f, 0.080252f,  -0.003912f,
-  -0.040345f, -0.121881f, 0.126672f,  0.093507f,  -0.081305f, -0.081099f,
-  -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f,  0.245259f,
-  -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f,
-  0.103205f,  -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f,
-  -0.204005f, -0.173636f, 0.020302f,  -0.109112f, 0.081203f,  -0.137344f,
-  -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f,  -0.268105f,
-  -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f,  0.207510f,
-  -0.279023f, -0.272472f, -0.166427f, 0.205973f,  -0.345692f, -0.238400f,
-  -0.319178f, -0.327246f, -0.321756f, 0.043191f,  -0.027520f, -0.029310f,
-  0.161379f,  0.031154f,  -0.605365f, -0.230926f, 0.261142f,  -0.262678f,
-  -0.373351f, -0.326245f, 0.279222f,  0.684357f,  -0.864302f, 0.036132f,
-  0.239307f,  0.136262f,  0.124002f,  -0.410379f, -0.172722f, -0.376670f,
-  -0.195889f, 0.037292f,  -0.055295f, 1.022308f,  0.237600f,  -0.618435f,
-  0.366154f,  0.168308f,  -0.473467f, -0.756558f, -0.044830f, 0.019057f,
-  -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f,  0.001007f,
-  -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f,  0.254431f,
-  0.124238f,  -0.198181f, 0.142723f,  -0.112997f, -0.164224f, -0.355160f,
-  0.135330f,  -0.379557f, 0.079392f,  0.210607f,  -0.354927f, -0.277678f,
-  -0.931111f, 0.056208f,  -0.347710f, -0.355415f, 0.826145f,  0.390625f,
-  0.374414f,  -0.205685f, 0.562485f,  0.152288f,  0.130635f,  0.056622f,
-  0.057972f,  0.095526f,  -0.082436f, -0.085938f, -0.070570f, -0.087634f,
-  0.335934f,  0.084860f,  0.544424f,  -0.278917f, 0.476740f,  0.050927f,
-  -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f,
-  -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f,  0.060073f,
-  -0.042945f, 0.015249f,  1.241504f,  0.662613f,  0.530496f,  -0.180519f,
-  -1.099086f, -0.825844f, 0.551856f,  -0.025009f, -0.006619f, -0.001049f,
-  0.014828f,  -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f,
-  -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f,
-  -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f,
-  -0.468730f, -0.175170f, 0.136433f,  0.167739f,  -0.377602f, 0.135772f,
-  0.040972f,  -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f,
-  0.111125f,  -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f,
-  0.010556f,  0.058256f,  -0.127352f, 0.017665f,  0.072632f,  -0.171966f,
-  -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f,  -0.260229f,
-  0.025216f,  -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f,
-  -0.280123f, 0.132262f,  -0.308330f, -0.119036f, -0.303874f, -0.065445f,
-  -0.412137f, 0.057167f,  0.044582f,  -0.330952f, -0.232572f, 0.039732f,
-  -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f,  0.058277f,
-  -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f,
-  -1.071050f, 0.086854f,  -0.004932f, -0.259698f, 0.125301f,  -0.742663f,
-  -0.370517f, -0.772840f, 0.193628f,  0.554676f,  0.051283f,  -0.196639f,
-  0.040344f,  0.027391f,  -0.040501f, 0.038303f,  0.032972f,  -0.014638f,
-  0.097720f,  -0.206897f, -0.015480f, 0.008543f,  0.034469f,  0.127234f,
-  -0.396463f, -0.390189f, 0.117538f,  -0.435622f, 0.043420f,  -0.241987f,
-  -0.118254f, -0.190349f, 0.190273f,  -0.085625f, -0.141253f, -0.377438f,
-  -0.249211f, 0.214512f,  -0.363191f, -0.754851f, 0.238045f,  1.127635f,
-  0.173947f,  -0.357620f, 0.073671f,  0.220617f,  0.072067f,  -0.076214f,
-  -0.044583f, -0.018371f, 0.010952f,  -0.135116f, 0.076597f,  0.034480f,
-  -0.070212f, -0.454429f, -0.135215f, 0.163851f,  -0.625990f, -0.283991f,
-  0.284051f,  0.182935f,  -0.048717f, 0.002484f,  -0.009086f, 0.321724f,
-  0.125162f,  -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f,
-  0.123807f,  -0.313614f, -0.103142f, 0.072125f,  0.100320f,  -0.185558f,
-  -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f,  -0.381231f,
-  -0.436001f, -0.374834f, 0.230104f,  -0.500679f, 0.170880f,  0.029657f,
-  -0.105857f, -0.366671f, -0.268833f, 0.036885f,  -0.026776f, 0.037837f,
-  -0.362095f, -0.254933f, 0.129650f,  0.007945f,  -0.304715f, -0.100813f,
-  -0.342849f, -0.269223f, 0.178490f,  0.186735f,  -0.353995f, 0.050381f,
-  -0.440186f, 0.025985f,  1.096969f,  1.132937f,  0.581545f,  0.271734f,
-  -0.109169f, -0.014239f, 0.688644f,  0.602702f,  0.048616f,  0.022335f,
-  0.037545f,  0.081667f,  -0.109038f, -0.088565f, -0.002506f, -0.041420f,
-  -0.132515f, 0.187312f,  0.677273f,  1.111182f,  0.199096f,  -0.211551f,
-  -0.896508f, 0.257981f,  0.007803f,  0.160343f,  -0.124864f, -0.097150f,
-  0.225090f,  0.242900f,  -0.195665f, 0.011310f,  0.160765f,  0.169195f,
-  -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f,  0.511419f,
-  0.076009f,  -0.165861f, 0.240487f,  0.006298f,  -0.153334f, 0.041249f,
-  0.387092f,  0.313011f,  -0.032269f, 0.019024f,  0.052568f,  0.124247f,
-  0.197640f,  0.002537f,  0.651044f,  0.829828f,  -0.446444f, -0.402042f,
-  -0.469399f, -0.019842f, 0.371960f,  0.140373f,  -0.044808f, 0.008283f,
-  0.093791f,  0.052149f,  0.143123f,  -0.449571f, -0.868816f, -0.265661f,
-  -0.225232f, -0.014704f, 0.543836f,  -0.374498f, 0.561647f,  1.309445f,
-  0.056789f,  -0.048447f, 0.255758f,  0.644553f,  -0.124802f, 0.097419f,
-  -0.149336f, 0.021596f,  -0.043699f, 0.057591f,  -0.000077f, 0.034488f,
-  -0.049353f, -0.007799f, 0.437914f,  0.509369f,  0.674428f,  1.858949f,
-  -0.205964f, 0.060776f,  0.184213f,  0.037177f,  -0.062535f, -0.115408f,
-  0.076498f,  0.010235f,  -0.142253f, 0.009983f,  0.073436f,  0.038716f,
-  -0.369983f, -0.185959f, -0.137867f, 0.032134f,  0.213814f,  -0.125571f,
-  0.247874f,  -0.166871f, -0.160890f, 0.147029f,  0.267143f,  -0.298488f,
-  -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f,
-  0.399519f,  0.143200f,  -0.776419f, -0.374639f, -0.022066f, 0.582904f,
-  0.006430f,  -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f,
-  -0.398255f, -0.173231f, 0.211789f,  -0.036121f, -0.266856f, 0.042956f,
-  -1.138513f, -0.070313f, 0.158803f,  0.406989f,  -0.015974f, 0.651020f,
-  -0.468982f, -0.310019f, 0.416922f,  0.895162f,  0.019921f,  0.004023f,
-  0.006962f,  0.000863f,  -0.216395f, -0.074913f, -0.002613f, 0.026703f,
+  -0.219494f, -0.428273f, 0.471006f,  0.448210f,  -0.152935f, 0.440435f,
+  0.922857f,  -0.074436f, 1.002195f,  0.414176f,  -0.327202f, -0.380066f,
+  -0.212346f, 0.061868f,  -0.056620f, 0.594134f,  0.617995f,  0.308358f,
+  0.232484f,  0.129849f,  1.483593f,  -0.071460f, 1.984515f,  1.116422f,
+  -1.141762f, -0.306220f, 0.089075f,  -0.271845f, 0.187524f,  0.050396f,
+  -0.061025f, 0.030809f,  0.172799f,  -0.458151f, -0.318357f, 0.122052f,
+  -0.414329f, 0.089366f,  0.118898f,  -0.376213f, -0.206151f, -0.519946f,
+  -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f,  -0.245280f,
+  -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+  -0.500856f, 0.003388f,  -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+  0.023005f,  0.157273f,  0.073400f,  0.019099f,  -0.113848f, -0.098601f,
+  -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+  -1.282604f, 0.048505f,  -0.746382f, 0.093740f,  -0.706583f, -0.085729f,
+  0.947382f,  -0.002961f, 1.175362f,  1.007309f,  0.141638f,  -0.037608f,
+  -0.118807f, -0.021474f, -0.146763f, 0.069363f,  -0.074372f, -0.215713f,
+  -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f,  -0.534598f,
+  -0.357759f, -0.455950f, 0.139469f,  0.036582f,  -0.384743f, -0.168828f,
+  -0.239250f, 0.003520f,  -0.049003f, 0.075702f,  -0.025809f, -0.225972f,
+  -0.228905f, -0.412489f, 0.060570f,  -0.328819f, -0.206446f, -0.080231f,
+  -0.372008f, -0.218118f, -0.011954f, 0.024155f,  0.156014f,  0.020679f,
+  0.194398f,  -0.283491f, -0.024463f, -0.275099f, 0.028031f,  0.026340f,
+  -0.254668f, 0.103637f,  2.178693f,  0.552284f,  0.109366f,  -0.474806f,
+  -0.379286f, -0.026315f, 2.487924f,  -0.089466f, 0.206428f,  0.114578f,
+  0.152248f,  0.184050f,  -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+  0.009343f,  -0.021029f, -0.060534f, -0.025164f, 1.841311f,  1.842748f,
+  -1.979708f, 0.450985f,  -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+  0.198991f,  -0.258070f, 0.055974f,  0.224069f,  0.453051f,  0.408053f,
+  0.027873f,  -0.180538f, 0.056609f,  0.207654f,  0.104086f,  -0.194426f,
+  -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+  -0.160439f, -0.044856f, -0.346647f, 0.044859f,  0.231398f,  -0.023643f,
+  -0.140316f, -0.260177f, 0.206965f,  -0.425386f, -0.420268f, -0.409748f,
+  0.006971f,  0.066186f,  -0.034950f, -0.345518f, 0.018633f,  -0.122489f,
+  -0.038506f, -0.330942f, 0.161236f,  -0.314119f, -0.050202f, -0.179597f,
+  0.731897f,  -0.184481f, 0.153598f,  -0.539501f, -0.301493f, -0.184967f,
+  -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+  -0.101083f, -0.064142f, 0.161190f,  0.430826f,  0.355647f,  0.138266f,
+  0.051114f,  -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+  -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+  0.120193f,  0.011360f,  -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+  -0.259893f, -0.073217f, 0.200128f,  0.103407f,  -0.229233f, 0.128831f,
+  -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+  -0.442879f, -0.310456f, -0.112881f, 0.263696f,  -0.205014f, -0.497936f,
+  -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+  0.117232f,  -0.577809f, 0.154596f,  -0.409522f, -0.413113f, -0.359199f,
+  0.307294f,  -0.008746f, -0.310522f, 0.347620f,  -0.384845f, -0.451398f,
+  -0.226199f, 0.054154f,  -0.167608f, 0.046836f,  -0.013285f, -0.408119f,
+  -0.177973f, -0.248293f, -0.465830f, 0.035827f,  -0.222208f, -0.221717f,
+  0.066392f,  -0.349769f, -0.428029f, -0.516692f, 0.022398f,  -0.251682f,
+  0.134746f,  0.011167f,  -2.078787f, 0.173592f,  -1.948348f, 0.330060f,
+  1.993785f,  -0.052859f, -0.004795f, -3.703177f, 0.013450f,  -0.011687f,
+  0.073079f,  0.034803f,  0.025515f,  0.005994f,  0.101731f,  0.074303f,
+  -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+  0.007667f,  -0.358453f, 0.027412f,  0.033492f,  0.021197f,  -0.049991f,
+  0.104468f,  -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+  -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+  -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f,  -0.246475f,
+  -0.229144f, -0.087744f, -0.346909f, 0.172611f,  0.004377f,  -0.009386f,
+  -0.023104f, 0.008000f,  -0.029390f, -0.317842f, 0.549674f,  -0.195337f,
+  -0.863979f, 0.160889f,  -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+  -0.112837f, 0.881303f,  0.000764f,  -0.035415f, -0.141877f, 0.184831f,
+  -0.363566f, -0.178569f, 0.254134f,  -0.326893f, 0.127325f,  0.310620f,
+  -0.384621f, 0.146058f,  -0.287682f, -0.373447f, 0.026930f,  0.251650f,
+  0.053817f,  0.227509f,  0.121396f,  0.396514f,  -0.278381f, -0.038969f,
+  -1.538756f, -0.002856f, -0.892900f, 0.363426f,  -1.257922f, 0.743795f,
+  0.941177f,  0.219345f,  0.684189f,  1.396858f,  0.026299f,  -0.093433f,
+  -0.066182f, 0.057868f,  -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+  0.005349f,  -0.031314f, 0.027917f,  -0.182113f, -0.212086f, -0.160774f,
+  0.051468f,  0.036787f,  0.183881f,  -0.288205f, -0.349691f, 0.162511f,
+  0.117878f,  -0.294534f, -0.365037f, -0.246313f, 0.073977f,  -0.072378f,
+  -0.173579f, -0.584560f, 0.547194f,  0.259853f,  -0.405287f, -0.421146f,
+  0.165788f,  -0.146964f, 0.257415f,  0.772394f,  -0.475302f, -0.310906f,
+  0.058723f,  0.276833f,  0.586842f,  0.248998f,  -0.061135f, 0.255779f,
+  0.152158f,  -0.024781f, 2.821834f,  1.365141f,  0.914744f,  0.165752f,
+  -1.048304f, -0.333891f, 1.804087f,  -0.437028f, -0.120211f, -0.020443f,
+  0.040077f,  0.258600f,  -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+  0.005258f,  0.053986f,  0.322755f,  0.429495f,  -1.992364f, -0.717192f,
+  -1.774802f, 2.047362f,  -0.016194f, 0.312606f,  0.019331f,  0.060950f,
+  0.116428f,  0.168458f,  -0.307001f, -0.420734f, 0.475843f,  0.425346f,
+  -0.107119f, 0.049892f,  -1.168619f, 0.010878f,  0.354872f,  0.902717f,
+  -0.391407f, 0.332772f,  -1.335037f, -0.447100f, 0.481719f,  -0.101069f,
+  -1.806565f, 0.925280f,  0.346999f,  0.093809f,  0.006275f,  0.270814f,
+  -0.691123f, 0.230748f,  0.137033f,  0.068228f,  1.555975f,  -0.271637f,
+  -0.370403f, 0.236131f,  0.367464f,  -0.136562f, 0.428838f,  0.181750f,
+  0.338762f,  0.292449f,  -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+  -0.140501f, 0.070525f,  1.248748f,  0.637990f,  -1.307246f, -0.514055f,
+  0.393858f,  -1.858727f, 0.713591f,  -0.141044f, 0.080723f,  0.120220f,
+  -0.031175f, 0.224488f,  0.753818f,  -0.833351f, -1.099132f, 0.651100f,
+  -0.135061f, -0.043820f, 0.026983f,  -0.059259f, 0.001345f,  -0.281775f,
+  0.006958f,  0.046103f,  -0.246539f, 0.057630f,  -0.360778f, -0.160681f,
+  -0.414870f, -0.301979f, 0.000683f,  0.132957f,  -0.477609f, 0.106110f,
+  -0.637769f, -0.078374f, -0.229494f, 0.583108f,  -0.822973f, -0.107540f,
+  1.063426f,  -0.268346f, 1.105787f,  2.587550f,  -0.020314f, -0.002161f,
+  -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+  -0.019870f, -0.018920f, -0.219732f, 0.035608f,  -1.789450f, 0.483032f,
+  -0.464729f, 1.563277f,  -1.054195f, 0.359991f,  0.065204f,  0.135623f,
+  0.158380f,  -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
 };
 
 static const float av1_4_partition_nn_bias_32_layer0[32] = {
-  0.133615f,  -0.113389f, -0.575989f, 0.589389f,  -0.193574f, -0.132463f,
-  0.000000f,  0.060317f,  0.264577f,  -0.060599f, 0.540147f,  -0.127782f,
-  -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f,
-  -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f,
-  -0.448931f, 0.446300f,  0.250002f,  0.223559f,  -0.647723f, -0.014369f,
-  0.084333f,  -0.056270f,
+  0.421645f,  -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+  -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+  -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f,  -0.032831f,
+  -0.165621f, 0.145844f,  -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+  -0.862375f, -0.108826f, -0.486259f, 0.685325f,  0.072569f,  -0.187961f,
+  0.109579f,  -0.082685f,
 };
 
 static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
-  -0.069633f, -0.087239f, 0.365816f,  -0.068579f, 0.231198f,  -0.067856f,
-  -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f,
-  0.778888f,  0.169624f,  0.089968f,  -0.243569f, 0.353483f,  0.032296f,
-  -0.157408f, 0.286885f,  -0.063537f, -0.324055f, -0.161464f, 0.430600f,
-  0.277707f,  -0.196463f, 0.154647f,  0.059804f,  0.176408f,  0.303179f,
-  -0.040156f, 0.375810f,  -0.363032f, -0.186808f, -0.264561f, -0.158937f,
-  -0.007949f, -0.076394f, 0.056475f,  0.308528f,  0.695387f,  0.051336f,
-  0.433063f,  -0.229948f, -1.210712f, 0.036286f,  0.183868f,  -0.117660f,
-  0.230134f,  -0.093469f, 0.237918f,  0.625986f,  -0.236671f, -0.377172f,
-  0.331091f,  -0.394004f, -0.214349f, 0.243940f,  -0.600348f, 0.069843f,
-  0.088325f,  0.225775f,  0.276884f,  -0.604493f, 0.769812f,  0.259574f,
-  0.086220f,  0.511515f,  -0.282584f, -0.157719f, 0.278778f,  -0.332732f,
-  0.068985f,  -0.237236f, -0.006102f, -0.154883f, 0.710288f,  -0.245896f,
-  -0.255895f, -0.398038f, 0.304084f,  -0.317065f, 0.192609f,  -0.235613f,
-  0.461340f,  0.117194f,  0.116817f,  0.196150f,  0.421622f,  -0.264495f,
-  0.617852f,  -0.351756f, -0.310016f, 0.135932f,  -0.242622f, -0.073094f,
-  0.042077f,  0.039230f,  -0.482715f, 0.553187f,  0.360637f,  0.313484f,
-  -0.131540f, -0.104731f, 0.374704f,  0.222173f,  0.437657f,  0.029827f,
-  -0.545156f, -0.203176f, 0.267824f,  0.169237f,  -0.057871f, 0.552197f,
-  0.272243f,  0.025681f,  -0.262192f, 0.255934f,  -0.202407f, -0.483317f,
-  -0.204721f, 0.288807f,  -0.030735f, -0.047161f, -0.780724f, 0.381939f,
-  -0.295318f, 0.537378f,
+  0.255012f,  0.658860f,  0.216907f,  0.165947f,  0.241182f,  0.340854f,
+  0.409445f,  0.165220f,  0.553373f,  -0.242385f, -0.209571f, 0.255515f,
+  0.222500f,  0.037032f,  0.238590f,  0.061624f,  -2.038693f, 0.264167f,
+  -0.230144f, 0.129952f,  -0.027979f, 0.847761f,  0.438922f,  0.462323f,
+  0.555345f,  0.030689f,  0.336357f,  -0.357326f, -0.113137f, 0.272631f,
+  0.421022f,  0.367776f,  -0.197094f, 0.157117f,  -0.015008f, -0.056123f,
+  -0.283913f, 0.186417f,  0.178561f,  -0.763041f, 0.602038f,  0.341092f,
+  0.320453f,  -0.312776f, -0.371240f, -0.356279f, 0.220117f,  -0.131871f,
+  1.517429f,  0.162223f,  -0.255069f, 0.451861f,  0.045071f,  -0.223257f,
+  0.003257f,  0.015734f,  -0.630447f, -0.672588f, 0.670164f,  0.571031f,
+  -0.657948f, 0.034506f,  -0.249076f, 0.790293f,  0.066491f,  -0.131245f,
+  0.355173f,  0.564622f,  0.374048f,  0.033974f,  0.253970f,  0.495498f,
+  -0.556321f, -0.104651f, 0.276947f,  0.057148f,  -0.039126f, -0.170050f,
+  -0.141542f, 0.158541f,  0.582763f,  -0.100992f, 0.096705f,  -0.209029f,
+  0.008449f,  0.255865f,  0.103565f,  0.317719f,  0.479499f,  0.599126f,
+  -0.065613f, -0.268614f, 0.508736f,  0.180813f,  -0.815868f, 0.051238f,
+  0.001223f,  -0.305423f, -0.270079f, 0.036180f,  0.304342f,  0.202634f,
+  0.218348f,  -0.304304f, -0.438297f, 0.241123f,  0.200230f,  0.151804f,
+  0.051944f,  0.160422f,  -0.262981f, -0.417412f, 1.845729f,  -0.086183f,
+  0.403517f,  0.059667f,  0.564543f,  -0.081752f, 0.114907f,  -0.284489f,
+  -0.673943f, 0.056965f,  0.362221f,  0.403224f,  -0.000233f, -0.209552f,
+  -0.800926f, -0.134132f,
 };
 
 static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
-  -0.332518f,
-  0.114452f,
-  0.098949f,
-  0.465896f,
+  -0.019518f,
+  0.198546f,
+  0.339015f,
+  -0.261961f,
 };
 
 static const NN_CONFIG av1_4_partition_nnconfig_32 = {
@@ -1688,82 +1596,112 @@ static const NN_CONFIG av1_4_partition_nnconfig_32 = {
   },
 };
 
-static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
-  0.256343f,  -0.021774f, -0.117102f, 0.416930f,  0.188160f,  0.148768f,
-  -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f,
-  0.222769f,  -0.199332f, 0.058667f,  -0.679529f, 0.081744f,  0.044438f,
-  -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f,
-  -0.049503f, -0.309894f, -0.323991f, 0.166113f,  0.138104f,  -0.263629f,
-  -0.368460f, -0.273989f, 0.147239f,  0.044566f,  -0.363357f, -0.030792f,
-  0.020734f,  0.068506f,  -0.434214f, 0.581644f,  -1.244146f, -0.569162f,
-  0.179499f,  -0.188900f, 0.078431f,  -0.392126f, -0.006431f, 0.112146f,
-  -0.065892f, -0.051319f, 0.094607f,  0.251700f,  -0.000650f, 0.011911f,
-  0.080449f,  0.022816f,  0.322382f,  0.577070f,  0.927738f,  0.178707f,
-  -0.101237f, -0.212521f, 0.560261f,  -0.206492f, -0.077591f, -0.069960f,
-  0.025727f,  0.041122f,  -0.735228f, -0.506091f, -0.600776f, -0.117829f,
-  0.103556f,  0.141823f,  0.853448f,  0.339488f,  0.994022f,  0.121693f,
-  -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f,
-  0.042469f,  -0.005319f, -0.305784f, -0.371353f, 0.011194f,  -0.018597f,
-  0.209260f,  0.071577f,  0.242470f,  -0.856593f, 0.288842f,  1.062608f,
-  -0.300472f, 0.221623f,  -0.813563f, -0.250347f, -0.081455f, -0.092779f,
-  -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f,
-  -0.006546f, 0.051671f,  0.545608f,  1.101804f,  0.288086f,  1.107046f,
-  -0.200012f, 0.220182f,  -0.189220f, -0.554973f, 0.040711f,  -0.058029f,
-  0.043737f,  0.016164f,  -0.391790f, -0.287770f, -0.046545f, 0.045071f,
-  0.190005f,  -0.076963f, 0.836839f,  1.633266f,  0.902928f,  0.991972f,
-  -0.127932f, 0.293680f,  -0.035984f, 0.476179f,  -0.098024f, 0.068314f,
-  -0.058365f, 0.096221f,  -0.000321f, -0.128840f, 0.136441f,  -0.061853f,
-  0.270367f,  -0.184129f, -0.373670f, -0.177381f, 0.262109f,  -0.378013f,
-  -0.053249f, -0.456389f, 0.222972f,  -0.228067f, -0.115210f, -0.277797f,
-  0.096913f,  -0.014512f, -0.015533f, 0.026389f,  -0.360536f, -0.078477f,
-  -0.203186f, 0.199574f,  0.770476f,  0.595592f,  0.360828f,  0.547721f,
-  -0.804787f, 0.389690f,  -0.437645f, 0.576776f,  0.081903f,  0.082750f,
-  0.007166f,  -0.143755f, 0.114462f,  0.472432f,  -0.058974f, 0.077761f,
-  -2.015181f, -0.054942f, -0.110894f, 0.529188f,  -0.003300f, 0.913895f,
-  -0.324643f, 0.316135f,  -0.291729f, 1.072647f,  -0.029236f, 0.045592f,
-  -0.039399f, 0.043472f,  -0.303244f, -0.108761f, -0.011154f, 0.009693f,
-  -0.374985f, 0.027758f,  0.302075f,  -0.295758f, -0.165563f, -0.297259f,
-  -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f,
-  0.089003f,  -0.301585f, 0.022474f,  0.077477f,  -0.032233f, -0.231036f,
-  0.143206f,  0.169113f,  -0.556486f, 0.346327f,  -0.667790f, 0.126983f,
-  0.179727f,  0.397307f,  -0.490612f, -1.708789f, -0.040336f, -0.028547f,
-  -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f,  0.031344f,
-  -0.131692f, 0.119353f,  0.799313f,  0.443848f,  -0.499919f, -1.002983f,
-  0.375477f,  0.221096f,  -0.238033f, 0.284849f,  0.021897f,  0.023338f,
-  -0.059067f, 0.117276f,  0.039540f,  0.049630f,  0.175150f,  0.014166f,
-  -0.071486f, 0.091234f,  -1.007432f, -1.417378f, 0.640528f,  1.442576f,
-  -0.257183f, -0.597016f, 0.861785f,  0.276121f,  -0.098017f, 0.120514f,
-  -0.133184f, 0.106529f,  0.171644f,  0.059513f,  0.215952f,  -0.009441f,
-  -0.505313f, 0.063174f,  0.229148f,  -0.344213f, 0.862721f,  1.549941f,
-  -0.220129f, 0.493094f,  0.264095f,  0.143641f,  0.084968f,  -0.078266f,
-  0.032335f,  -0.019006f, -0.098205f, 0.119213f,  -0.103465f, 0.072811f,
-};
-
-static const float av1_4_partition_nn_bias_64_layer0[16] = {
-  0.111611f,  -0.067682f, 0.633594f,  0.143559f,  -1.051284f, -0.266625f,
-  -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f,  -0.080769f,
-  0.235166f,  0.449468f,  0.294689f,  -0.395300f,
-};
-
-static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = {
-  -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f,  0.141445f,
-  0.507161f,  0.435533f,  -0.263268f, 0.585105f,  0.235301f,  0.127536f,
-  -0.688639f, -0.217993f, -0.540066f, 0.406718f,  0.018210f,  -0.077349f,
-  -0.124823f, -0.488220f, -0.957026f, 0.302632f,  0.285490f,  -0.411356f,
-  0.091089f,  0.103862f,  -0.549291f, 0.148628f,  0.640603f,  -0.601018f,
-  0.178024f,  0.601370f,  0.313780f,  0.051938f,  0.524083f,  0.814631f,
-  -0.415522f, -0.738849f, 0.477881f,  -0.342864f, 0.105181f,  0.040010f,
-  -0.177521f, 0.400646f,  0.167093f,  0.388279f,  -0.898439f, -0.111936f,
-  0.469875f,  -0.099528f, -0.217370f, 0.283742f,  -0.033798f, -0.142797f,
-  -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f,  -0.527150f,
-  -0.021259f, 0.194651f,  -0.276294f, -0.109514f,
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+  -0.152649f, 0.074509f,  1.000136f,  0.601661f,  -1.416694f, -1.932396f,
+  -1.163850f, 0.640931f,  -0.888625f, -0.345711f, 0.161799f,  0.103165f,
+  0.147513f,  0.089956f,  0.204329f,  0.196922f,  0.014927f,  0.283714f,
+  -0.110422f, 0.062005f,  -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+  -0.005592f, -0.130490f, -0.015779f, 0.093521f,  -0.158487f, 0.072241f,
+  0.066879f,  -0.418566f, -0.206281f, 0.025634f,  0.048334f,  -0.534750f,
+  0.302081f,  0.028707f,  -1.543248f, 0.103799f,  -1.214052f, 0.395870f,
+  0.394754f,  -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+  0.014742f,  0.065263f,  0.000879f,  -0.019768f, 0.101275f,  0.163059f,
+  -0.371392f, -0.283484f, 0.241915f,  0.012684f,  -0.210101f, -0.166534f,
+  -0.024894f, 0.274696f,  0.098993f,  0.104086f,  0.055044f,  -0.289378f,
+  0.146571f,  -0.147441f, 0.004056f,  0.112244f,  -0.416162f, -0.033176f,
+  -0.214836f, -0.213787f, 0.023197f,  -0.339043f, 0.301109f,  -0.408551f,
+  0.284922f,  -0.344418f, -0.039255f, 0.158748f,  -0.344169f, 0.078286f,
+  -0.043957f, -0.302162f, -0.310826f, 0.063425f,  0.198166f,  -0.285324f,
+  -0.108252f, 0.038992f,  -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+  0.643206f,  -0.850240f, 0.889641f,  -0.733214f, 0.147302f,  0.060291f,
+  -0.052954f, 0.167453f,  0.111870f,  0.085471f,  0.035107f,  0.064361f,
+  0.176053f,  0.184373f,  0.676576f,  0.066164f,  1.455569f,  0.925111f,
+  -0.640845f, 0.803795f,  -0.653782f, -0.201038f, 0.060033f,  0.016964f,
+  -0.047590f, 0.045908f,  0.354162f,  0.014812f,  0.156978f,  0.058792f,
+  -0.238119f, 0.002450f,  -0.094388f, -0.155229f, 0.194858f,  -0.355429f,
+  -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f,  -0.425339f,
+  -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f,  -0.185816f,
+  -0.317294f, 0.002453f,  -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+  -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+  -0.146812f, 0.171111f,  0.090261f,  -0.367033f, -0.299051f, -0.322132f,
+  0.428192f,  -0.252613f, 0.488498f,  -0.559682f, 0.486720f,  -0.511084f,
+  0.992506f,  0.346765f,  -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+  -0.426517f, -0.516836f, 0.307083f,  0.609362f,  0.369555f,  0.093775f,
+  -0.375664f, -0.221595f, -0.025465f, 0.134374f,  -0.387031f, 0.096236f,
+  0.337465f,  -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+  -0.247705f, 0.146559f,  -0.236206f, -0.036073f, 0.064206f,  -0.330919f,
+  0.516591f,  -0.013492f, 1.269568f,  1.182530f,  -0.455390f, -1.328091f,
+  -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f,  0.021176f,
+  0.169119f,  0.103707f,  -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+  0.459111f,  0.036129f,  0.769570f,  -0.080405f, 1.667107f,  0.355567f,
+  -2.433896f, 0.627572f,  -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+  -0.009933f, 0.014864f,  -0.049378f, -0.041561f, 0.075180f,  0.138307f,
+  0.122366f,  -0.160756f, 0.215327f,  0.013572f,  0.198194f,  -0.762650f,
+  0.054466f,  1.110332f,  1.692853f,  0.658654f,  -0.409549f, 0.506085f,
+  0.330962f,  -0.223008f, 0.007448f,  -0.289062f, -0.476231f, -0.228359f,
+  0.013977f,  -0.000609f, -0.673604f, 0.275996f,  0.405291f,  1.693561f,
+  -1.079768f, 1.122516f,  -0.203227f, 0.099265f,  -0.165207f, -0.323899f,
+  -0.269973f, -0.080122f, 0.127700f,  0.190201f,  0.219527f,  0.306194f,
+  0.026049f,  -0.003779f, 1.107357f,  1.720315f,  1.017908f,  0.078664f,
+  -1.599813f, -0.482636f, -0.117450f, 0.122249f,  0.030220f,  0.039794f,
+  0.176350f,  0.129715f,  -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+  -0.073616f, -0.564507f, -0.127758f, 0.044855f,  -0.191090f, 0.039095f,
+  0.115378f,  0.969352f,  -0.088360f, 0.301443f,  0.065726f,  -0.019740f,
+  -0.102350f, -0.084913f, -0.194615f, 0.118582f,  0.920789f,  -0.171615f,
+  -1.436553f, -0.026419f, -0.730864f, 0.615697f,  -0.795079f, 0.119701f,
+  0.601782f,  0.792902f,  0.184920f,  1.635090f,  -0.085860f, -0.033187f,
+  -0.166883f, 0.008487f,  -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+  -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+  -0.063101f, -0.121002f, 0.058072f,  -0.031166f, 0.086413f,  -0.016203f,
+  -0.305075f, -0.005420f, -0.168796f, 0.148745f,  -0.116737f, -0.050222f,
+  -0.287952f, -0.290982f, -0.090449f, 0.076098f,  -0.345632f, -0.061309f,
+  0.142218f,  0.035692f,  0.304517f,  -0.228031f, 0.119608f,  -0.120350f,
+  0.163404f,  -0.105605f, -0.305462f, -0.176657f, 0.210070f,  -0.227600f,
+  -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f,  0.017162f,
+  -0.069792f, 0.305768f,  -0.421095f, 0.187740f,  -0.032059f, 0.575115f,
+  -0.064283f, -0.091828f, 0.772648f,  -0.393189f, -0.297098f, 0.141420f,
+  0.826389f,  -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+  1.546000f,  -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+  -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+  -0.492625f, 0.025350f,  -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+  0.108957f,  0.246298f,  -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+  0.073806f,  -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+  0.018810f,  -0.098241f, 1.027369f,  0.479328f,  1.129707f,  0.484813f,
+  -0.085207f, 0.621873f,  -0.520981f, 0.236175f,  0.273487f,  0.061426f,
+  0.306085f,  0.161487f,  0.220991f,  0.223783f,  -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+  0.580225f,  -0.191304f, 1.091767f,  -0.134522f, -0.089361f, 0.398750f,
+  -0.882708f, -0.213102f, -0.119981f, 0.378296f,  -0.075719f, 0.426598f,
+  -2.015505f, 0.202534f,  -1.044792f, -0.841519f, 0.266421f,  -0.047115f,
+  -0.131147f, -0.075066f, -0.009441f, 0.853007f,  -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+  -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f,  -0.340847f,
+  0.498032f,  0.308652f,  -0.051574f, 0.323146f,  -0.097547f, -0.040269f,
+  1.909655f,  0.098348f,  0.588136f,  0.568112f,  0.313297f,  0.920848f,
+  -0.014486f, 0.386014f,  0.029199f,  -0.537330f, -0.021502f, 0.349073f,
+  -0.524715f, -0.351848f, 1.565454f,  -0.297148f, 0.020177f,  0.648369f,
+  0.027321f,  -0.096052f, -0.363163f, -0.132642f, 0.024292f,  -0.734176f,
+  -0.782700f, 0.408299f,  0.476945f,  -0.489512f, -0.728318f, -0.632042f,
+  0.405417f,  0.184086f,  -0.400730f, 0.359032f,  0.019710f,  -0.217409f,
+  0.519159f,  -0.136316f, 0.993592f,  -0.147128f, 0.097495f,  0.426189f,
+  -0.295233f, 0.278799f,  0.080667f,  -0.025052f, -0.307757f, 0.418716f,
+  -0.853388f, -0.374878f, -0.322725f, 0.696335f,  -0.380649f, -0.160356f,
+  -0.140060f, 0.502455f,  0.656728f,  -0.095023f, -0.184198f, -0.347069f,
+  0.456372f,  -0.029754f, 0.907923f,  0.265710f,  -0.065505f, 0.226763f,
+  -0.277798f, 0.413292f,  -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+  -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+  -0.057382f, 0.334741f,  -0.283083f, 0.368280f,  -0.407197f, -0.441849f,
 };
 
 static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
-  -0.688947f,
-  0.121075f,
-  0.289597f,
-  0.948091f,
+  -0.478735f,
+  0.292948f,
+  0.293172f,
+  0.040013f,
 };
 
 static const NN_CONFIG av1_4_partition_nnconfig_64 = {
@@ -1771,7 +1709,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = {
   LABEL_SIZE,    // num_outputs
   1,             // num_hidden_layers
   {
-      16,  // num_hidden_nodes
+      24,  // num_hidden_nodes
   },
   {
       av1_4_partition_nn_weights_64_layer0,
@@ -1786,8 +1724,725 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = {
 #undef FEATURE_SIZE
 #undef LABEL_SIZE
 
+#define FEATURE_SIZE 4
+static const float
+    av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+      -0.331785f,  0.068675f,  -0.323814f,  0.033714f,  -0.237835f, 0.166316f,
+      -0.498766f,  -0.545634f, -0.266173f,  -0.476957f, -0.120409f, -0.021042f,
+      0.124056f,   -0.278750f, -0.110120f,  -0.372812f, 4.547939f,  0.097618f,
+      -0.002710f,  -0.064169f, -1.841173f,  -0.403833f, 0.005536f,  0.067188f,
+      -0.434935f,  -0.227421f, -0.000011f,  -0.139961f, -0.174056f, -0.652384f,
+      -0.000015f,  -0.262847f, -3.319706f,  -0.947693f, 0.002981f,  0.016717f,
+      -10.408850f, -0.014568f, -0.000018f,  0.019084f,  1.523383f,  0.074525f,
+      -0.002076f,  -0.020734f, 4.881495f,   0.002799f,  0.000342f,  -0.019623f,
+      1.786154f,   0.037462f,  -0.019037f,  0.052833f,  11.408153f, -0.044602f,
+      0.026155f,   -0.518627f, -0.474499f,  -0.427430f, -0.442733f, -0.011116f,
+      -22.379410f, -0.000549f, -0.001418f,  0.008090f,  -0.295090f, -0.230268f,
+      -0.337278f,  -0.001127f, -0.644282f,  -0.598783f, -0.539417f, -0.003303f,
+      9.189824f,   0.038066f,  -0.004097f,  -0.460045f, -0.308858f, -0.242691f,
+      -0.230835f,  -0.273057f, 0.152226f,   0.179239f,  -0.146382f, -0.004655f,
+      -0.242940f,  -0.718862f, -0.001685f,  -0.214736f, 3.263186f,  0.079463f,
+      -0.003854f,  -0.187461f, -0.599144f,  -0.419808f, -0.000597f, -0.136980f,
+      0.184813f,   -0.319525f, -0.007246f,  0.079709f,  -0.883229f, -0.343748f,
+      -0.000077f,  -0.172214f, -0.548759f,  -0.194674f, -0.144786f, 0.043896f,
+      -0.176364f,  -0.248394f, -0.090215f,  -0.294743f, -0.280980f, -0.181436f,
+      -0.115681f,  -0.071915f, -13.035494f, -0.075623f, 0.017052f,  -0.171152f,
+      5.910803f,   0.128344f,  0.010256f,   -1.073301f, 2.387826f,  0.166183f,
+      -0.007193f,  -0.257836f,
+    };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+  0.115591f,  -0.100178f, -0.165523f, -0.122997f, 11.045759f,  1.034761f,
+  -0.323672f, -0.189087f, 2.850950f,  7.010029f,  -21.447067f, 1.877031f,
+  0.437442f,  5.929414f,  -0.117274f, 4.462253f,  -0.135198f,  -0.145927f,
+  8.727211f,  0.000000f,  -3.532987f, -0.405898f, 11.364439f,  -0.141728f,
+  -5.994947f, -0.362574f, 1.857687f,  -0.100400f, -0.130312f,  0.006080f,
+  0.429660f,  -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+  -0.013738f, 0.022052f,  -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+  0.002091f,  0.014252f,  0.134834f,  0.190263f,  0.244175f,  -0.031747f,
+  0.020068f,  -0.068326f, 0.185471f,  0.660268f,  -0.134898f, -0.010376f,
+  -0.276023f, -0.282921f, -0.022769f, 0.007070f,  -0.186235f, 0.024407f,
+  -0.024837f, 0.005764f,  0.016599f,  -0.040077f, 0.020990f,  0.095054f,
+  -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+  0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+  FEATURE_SIZE,  // num_inputs
+  1,             // num_outputs
+  1,             // num_hidden_layers
+  {
+      32,  // num_hidden_nodes
+  },
+  {
+      av1_partition_breakout_nn_weights_128_layer0,
+      av1_partition_breakout_nn_weights_128_layer1,
+  },
+  {
+      av1_partition_breakout_nn_bias_128_layer0,
+      av1_partition_breakout_nn_bias_128_layer1,
+  },
+};
+
+static const float
+    av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+      0.872892f,  -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+      -0.001373f, 0.112147f,  5.281734f,  0.060704f,  0.000838f,  -0.961554f,
+      0.244995f,  0.154515f,  -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+      0.003208f,  -0.418226f, 2.618152f,  0.026832f,  0.003988f,  -0.404406f,
+      -0.405434f, 0.102791f,  -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+      0.012947f,  -0.195075f, 0.009311f,  -0.411410f, -0.010986f, -0.554822f,
+      0.160576f,  0.020796f,  -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+      -0.001322f, 0.055691f,  0.291924f,  -0.053076f, -0.148379f, -0.298383f,
+      1.022023f,  -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+      -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+      -2.407131f, -0.062304f, 0.000874f,  0.108786f,
+    };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+  0.081425f,  -14.404084f, 11.511393f, -0.930053f, 1.841889f,  15.020920f,
+  -1.872288f, 5.392535f,   -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+  -0.337413f, 4.492778f,   0.000000f,  17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+  -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+  0.014185f,  0.067030f,  -0.001939f, -0.175049f, 0.245992f,  -0.181660f,
+  -0.038572f, 0.307899f,  -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+  -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+  FEATURE_SIZE,  // num_inputs
+  1,             // num_outputs
+  1,             // num_hidden_layers
+  {
+      16,  // num_hidden_nodes
+  },
+  {
+      av1_partition_breakout_nn_weights_64_layer0,
+      av1_partition_breakout_nn_weights_64_layer1,
+  },
+  {
+      av1_partition_breakout_nn_bias_64_layer0,
+      av1_partition_breakout_nn_bias_64_layer1,
+  },
+};
+
+static const float
+    av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+      -4.825528f, -0.145737f, 0.001907f,  0.145415f,  -1.858153f, -0.080744f,
+      0.000601f,  0.211991f,  0.384265f,  -0.043945f, -0.521332f, -0.170622f,
+      -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+      -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+      -1.202551f, -0.729727f, -0.370084f, 0.088215f,  -1.926800f, -0.086519f,
+      0.000359f,  0.215120f,  0.718749f,  0.022942f,  0.003840f,  -0.176518f,
+      1.213451f,  0.080786f,  0.001557f,  -1.053430f, 0.202698f,  -0.583919f,
+      -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+      -0.178518f, -0.585784f, 0.000029f,  -0.833014f, -0.331358f, -0.520297f,
+      -0.088676f, -0.178487f, -1.430755f, 0.022981f,  -0.106931f, 0.015573f,
+      -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+    };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+  11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+  6.669584f,  16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+  -0.423808f, 0.000000f,  6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+  0.168561f,  -0.122519f, 0.524667f,  0.032474f,  0.059097f,  0.011900f,
+  0.166445f,  0.127256f,  -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+  -0.004171f, 0.157694f,  0.117845f,  0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+  0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+  FEATURE_SIZE,  // num_inputs
+  1,             // num_outputs
+  1,             // num_hidden_layers
+  {
+      16,  // num_hidden_nodes
+  },
+  {
+      av1_partition_breakout_nn_weights_32_layer0,
+      av1_partition_breakout_nn_weights_32_layer1,
+  },
+  {
+      av1_partition_breakout_nn_bias_32_layer0,
+      av1_partition_breakout_nn_bias_32_layer1,
+  },
+};
+
+static const float
+    av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+      0.209371f,  0.028758f,  0.005764f,  -0.384401f, -0.625777f, -0.005647f,
+      -0.316867f, 0.042985f,  0.127344f,  0.025461f,  0.011465f,  -0.071043f,
+      -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+      -0.130997f, -0.012326f, 0.024124f,  -0.323578f, -0.005790f, -0.085664f,
+      -1.575066f, -0.119221f, 0.015018f,  0.187204f,  0.238117f,  0.084924f,
+      -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+      -0.278642f, -0.011114f, 0.021162f,  0.081290f,  -0.467486f, -0.040771f,
+      -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+      -0.814479f, -0.050450f, -0.003666f, 0.085668f,  -0.272589f, 0.057330f,
+      -0.206540f, -0.303418f, 0.075335f,  -0.180468f, -0.064872f, -0.755948f,
+      -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+    };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+  16.421495f, 4.012273f,  -1.828571f, 0.000000f,  -0.263564f, -0.201972f,
+  6.564987f,  14.651000f, -3.227779f, 2.241833f,  -0.137116f, 0.762876f,
+  5.625762f,  0.615822f,  0.040057f,  16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+  -0.096440f, 0.184316f,  -0.021148f, 0.424974f, 0.003743f,  0.006310f,
+  0.046266f,  -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+  0.269773f,  -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+  1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+  FEATURE_SIZE,  // num_inputs
+  1,             // num_outputs
+  1,             // num_hidden_layers
+  {
+      16,  // num_hidden_nodes
+  },
+  {
+      av1_partition_breakout_nn_weights_16_layer0,
+      av1_partition_breakout_nn_weights_16_layer1,
+  },
+  {
+      av1_partition_breakout_nn_bias_16_layer0,
+      av1_partition_breakout_nn_bias_16_layer1,
+  },
+};
+
+static const float
+    av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+      -0.255885f, 0.109548f,  -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+      0.048241f,  -0.356013f, -0.085054f, 0.124908f,  0.000084f,  -0.149906f,
+      -0.729829f, 0.133535f,  -0.002125f, 0.207516f,  -0.210163f, -0.567365f,
+      -0.590103f, 0.045308f,  -0.539406f, 0.130550f,  -0.663879f, -0.170549f,
+      0.017587f,  -0.054187f, 0.000550f,  0.038297f,  -0.112891f, -0.012751f,
+      -0.048067f, 0.095564f,  0.079892f,  0.077285f,  -0.749708f, -0.286312f,
+      -0.054334f, 0.132242f,  -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+      -0.001034f, -0.090990f, 0.122823f,  -0.109794f, -0.230066f, -0.391155f,
+      -0.262245f, -0.004744f, -0.232246f, 0.099290f,  -0.637484f, 0.111937f,
+      -0.548556f, -0.598344f, 0.123265f,  -0.281395f, -0.399711f, -0.525671f,
+      -0.596269f, 0.098494f,  -0.005765f, 0.173652f,
+    };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+  0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+  2.056990f, 5.284306f,  0.639643f, -2.792049f, -2.232339f, -0.232209f,
+  2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+  -0.014439f, 0.010171f, 0.048116f,  -0.090659f, -0.081235f, -0.021840f,
+  -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+  0.055858f,  0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+  1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+  FEATURE_SIZE,  // num_inputs
+  1,             // num_outputs
+  1,             // num_hidden_layers
+  {
+      16,  // num_hidden_nodes
+  },
+  {
+      av1_partition_breakout_nn_weights_8_layer0,
+      av1_partition_breakout_nn_weights_8_layer1,
+  },
+  {
+      av1_partition_breakout_nn_bias_8_layer0,
+      av1_partition_breakout_nn_bias_8_layer1,
+  },
+};
+#undef FEATURE_SIZE
+
+#define FEATURE_SIZE 9  // Input layer size
+#define NUM_NODES 32    // Hidden layer size
+#define LABEL_SIZE 3    // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+                                                          NUM_NODES] = {
+  0.22151f,  0.99424f,  0.23415f,  -1.13841f, -0.11277f, 0.09530f,  0.14769f,
+  -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f,  0.15777f,  -0.03176f,
+  0.02729f,  -0.37344f, -0.01727f, -0.05469f, 0.19402f,  -3.45508f, 0.90106f,
+  -2.91557f, 0.19379f,  0.14356f,  -0.13291f, 0.05734f,  -0.03032f, -0.13060f,
+  0.35744f,  1.31630f,  -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+  1.08305f,  -0.21596f, 0.76244f,  1.10616f,  -1.71706f, 0.05768f,  0.10966f,
+  0.00949f,  -0.12680f, 0.00699f,  -0.11522f, -0.38566f, 0.34283f,  -0.35266f,
+  -0.40643f, -0.22462f, 0.32300f,  -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+  0.30314f,  -1.35659f, -0.38212f, 0.45857f,  0.76615f,  0.16819f,  -1.24459f,
+  0.39677f,  0.87436f,  -2.33757f, 1.27471f,  0.27488f,  0.01019f,  -0.01221f,
+  -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+  -0.06777f, -1.13868f, 0.01354f,  -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+  0.56899f,  1.17144f,  0.70899f,  0.48568f,  0.11266f,  0.81579f,  -0.03929f,
+  0.01088f,  0.33599f,  -0.22401f, -0.49654f, -0.02598f, 0.04509f,  -0.08217f,
+  -0.30687f, 0.19851f,  -2.96860f, -2.30698f, 0.01848f,  0.11801f,  0.06614f,
+  0.01673f,  -0.11002f, -0.08168f, 0.09204f,  -0.06379f, 0.27972f,  -0.31716f,
+  -0.00566f, -0.13651f, -0.37276f, 0.01511f,  -0.23697f, 0.21696f,  -0.19480f,
+  0.60758f,  -0.43506f, -0.02247f, -1.45073f, 0.84442f,  -0.94018f, 0.32550f,
+  0.03985f,  -0.06581f, 0.21665f,  0.79472f,  -2.41080f, 0.04788f,  -0.09492f,
+  -0.10677f, 0.07250f,  0.14329f,  -0.37319f, 0.53043f,  -0.49108f, 0.25792f,
+  -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+  -0.26196f, 0.93379f,  0.72115f,  0.54464f,  0.27642f,  0.04757f,  2.01629f,
+  1.55787f,  -0.11665f, 1.00722f,  -0.24352f, 0.53308f,  0.57719f,  0.39344f,
+  0.19174f,  0.06339f,  -0.02530f, 0.07724f,  -0.32416f, -0.26992f, -0.35887f,
+  -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f,  -1.52153f, -0.26503f,
+  0.97552f,  -2.96705f, -0.91220f, -0.11827f, 0.00406f,  -0.14514f, 0.18417f,
+  -0.20874f, 0.27293f,  -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+  -0.19293f, -0.18275f, -0.05902f, 0.58625f,  -0.05470f, -0.48814f, -0.45382f,
+  -0.05959f, 2.01250f,  -0.30014f, 0.69546f,  -1.24180f, 1.34923f,  0.20337f,
+  0.16850f,  0.07187f,  0.72630f,  -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+  -1.61695f, 0.50052f,  0.09730f,  0.00579f,  0.06133f,  -0.06512f, -0.61439f,
+  -1.16173f, -0.58716f, 1.60438f,  0.23242f,  0.91847f,  0.49041f,  -0.16277f,
+  -0.02574f, -0.64593f, 1.17028f,  0.46852f,  0.14926f,  0.73853f,  -0.78521f,
+  0.05959f,  -0.35590f, 0.02039f,  0.10812f,  -0.28650f, 1.34038f,  -0.72188f,
+  0.62385f,  -0.35271f, -0.39599f, 0.41543f,  0.53124f,  -0.23510f, -0.15480f,
+  -0.05066f, -0.33529f, 0.05238f,  -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+  0.26593f,  -0.18411f, -0.29945f, 0.50090f,  -0.03397f, 0.78562f,  -0.33068f,
+  1.21308f,  -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f,  0.65567f,
+  0.76496f,  0.44325f,  0.01368f,  -0.33619f, -0.64256f, 0.64478f,  0.84553f,
+  1.74183f,  0.22563f,  -0.14550f, -0.16258f, 0.03010f,  0.49922f,  0.64575f,
+  -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f,  0.87411f,
+  -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+  0.33919f,  -0.03003f, 0.79073f,  -0.18508f, 0.00668f,  -0.12017f, 0.35362f,
+  -0.51642f, 0.06536f,  0.41668f,  -0.06509f, 0.94606f,  -0.15385f, 0.14936f,
+  1.46274f,  -0.06961f, 2.82537f,  -1.95576f, -0.09457f, 0.02042f,  -0.07480f,
+  -0.55083f, 0.26170f,  4.39883f,  0.33999f,  -0.10502f, 0.70884f,  -0.06992f,
+  -0.22638f, 1.40940f,  -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+                                                          LABEL_SIZE] = {
+  0.09209f,  0.26236f,  0.62136f,  0.76324f,  -1.14678f, 0.42289f,  -0.08895f,
+  -0.97267f, 2.05958f,  0.00843f,  0.35335f,  1.12096f,  -0.11679f, 0.07350f,
+  -1.23231f, -0.61990f, 1.51379f,  -1.99450f, 0.22441f,  2.41974f,  -0.30488f,
+  -0.37869f, 0.47168f,  -3.70132f, 0.00061f,  0.19432f,  0.11512f,  0.26200f,
+  -0.35285f, 0.37985f,  0.90571f,  0.27344f,  0.74840f,  -0.17965f, -2.51433f,
+  0.59235f,  1.16670f,  -0.53446f, 0.67897f,  0.04505f,  -0.86874f, 0.45361f,
+  -0.35033f, 1.21283f,  0.31426f,  -0.20841f, 0.56757f,  0.45909f,  -1.23683f,
+  0.09835f,  -0.17214f, -0.96323f, 0.01138f,  -0.50233f, 0.30104f,  2.01814f,
+  1.15821f,  -0.11947f, 0.74574f,  -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+  -0.12259f, -0.54977f, 0.30069f,  1.84299f,  -0.95141f, -0.65887f, -0.25888f,
+  -0.63265f, 1.29531f,  -0.56672f, 0.10837f,  -0.21297f, -2.19131f, 0.01156f,
+  0.51912f,  0.46704f,  0.42810f,  -0.59271f, 0.98469f,  -0.17914f, -1.91163f,
+  -0.32807f, 0.48199f,  -0.99525f, 1.67108f,  -0.87631f, -0.60258f, -0.78731f,
+  -0.32877f, 0.44237f,  0.01087f,  0.07489f,  -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+  1.70665f,
+  -0.77954f,
+  -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_8_layer0,
+    av1_rect_partition_nn_weights_8_layer1 },
+  { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+                                                           NUM_NODES] = {
+  -0.18480f, -0.05410f, -0.18957f, 0.15451f,  -0.38649f, -0.26162f, -0.22727f,
+  -0.38555f, -0.36738f, 0.74384f,  -1.85999f, 0.98491f,  -0.72119f, 1.77321f,
+  0.39983f,  0.96314f,  0.23695f,  0.30200f,  0.30629f,  -0.47617f, -1.43320f,
+  -1.81730f, 0.36554f,  -0.07142f, -1.27242f, -1.27697f, 0.00110f,  -0.32179f,
+  0.27460f,  0.45428f,  0.15308f,  -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+  -0.85390f, 1.05484f,  -1.62812f, 0.77632f,  -0.27327f, -0.32527f, 0.32726f,
+  1.73255f,  0.53763f,  0.59121f,  -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+  0.07519f,  -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+  0.39674f,  -0.08596f, 0.18587f,  -2.04097f, -1.73993f, 1.57212f,  1.42410f,
+  -1.36762f, -0.41485f, -1.12103f, 0.56959f,  0.11500f,  0.48945f,  -0.13585f,
+  1.22125f,  0.67071f,  -1.11812f, -0.20660f, -0.52856f, 0.70663f,  0.74382f,
+  0.61114f,  -0.11454f, 1.14687f,  0.80322f,  -0.45965f, -0.44466f, -0.05830f,
+  0.13206f,  -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+  1.34433f,  2.49427f,  2.91955f,  1.71730f,  0.03295f,  0.03587f,  -0.14550f,
+  0.08189f,  -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+  -1.01334f, -0.57302f, 0.22592f,  0.05916f,  -0.05305f, -0.89824f, -0.52969f,
+  -0.24542f, 0.27029f,  -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+  1.83695f,  2.19716f,  2.31001f,  0.03657f,  0.00063f,  -0.04379f, 0.05835f,
+  -0.08623f, 0.20557f,  -0.17791f, 0.07874f,  -0.25456f, -0.19513f, -0.27753f,
+  -0.31982f, 0.00245f,  -0.33183f, 0.26059f,  -0.22165f, 0.37582f,  -0.30411f,
+  -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f,  1.03673f,
+  0.66139f,  0.44941f,  -0.44461f, -0.50376f, -0.49664f, 0.18608f,  -0.26175f,
+  0.14844f,  0.78715f,  -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+  -0.43135f, -0.22571f, 3.46263f,  3.13580f,  -1.33203f, -0.15247f, -0.15866f,
+  -0.11214f, 0.12211f,  0.03964f,  -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+  -5.62336f, -0.05337f, -0.00943f, 0.00792f,  0.02742f,  1.05679f,  2.41455f,
+  0.85382f,  1.42504f,  0.58096f,  0.21443f,  1.02694f,  1.06746f,  1.20242f,
+  0.60767f,  1.98667f,  -0.80879f, -0.63495f, 1.95508f,  0.23952f,  -0.15019f,
+  -0.16097f, 0.30155f,  -3.42407f, -1.34998f, 9.07689f,  -2.22559f, 2.22562f,
+  -0.03348f, -0.05229f, 0.05931f,  0.03042f,  -0.18068f, -0.05732f, -0.33010f,
+  -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f,  -0.16366f, -0.24935f,
+  -0.69124f, 0.58508f,  0.50654f,  0.04492f,  1.38340f,  -1.51487f, 1.72889f,
+  -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f,  0.03784f,
+  0.08052f,  -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+  0.61751f,  -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+  0.15671f,  -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f,  -0.11533f,
+  -0.05799f, -0.03142f, 0.20218f,  -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+  -0.28459f, -0.20346f, 0.89457f,  -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+  0.33915f,  0.50047f,  -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+  -1.70297f, 1.00482f,  -0.00103f, -1.40813f, 0.21311f,  0.39230f,  -0.07302f,
+  -3.49100f, 1.60675f,  -2.90692f, 0.11022f,  0.13507f,  -0.13308f, 0.15201f,
+  -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+  -0.16783f, -0.16023f, 0.52215f,  -0.04109f, 2.00122f,  -0.11633f, 0.25535f,
+  1.80638f,  1.69273f,  -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+  0.00000f,  -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f,  -0.78548f,
+  -1.39335f, -5.42248f, -0.10388f, 0.07634f,  2.81012f,  -0.57429f, -0.15629f,
+  -0.12044f, 1.65478f,  -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+                                                           LABEL_SIZE] = {
+  -0.26407f, 0.06322f,  0.87932f,  0.17772f,  0.71686f,  -0.12283f, 0.08454f,
+  0.20098f,  -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f,  3.80486f,
+  0.16750f,  0.29218f,  0.57234f,  -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+  0.95803f,  -4.13925f, 0.24567f,  0.25708f,  1.60547f,  -1.03251f, -0.31053f,
+  -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f,  0.13689f,
+  0.24504f,  0.49623f,  0.19980f,  0.38349f,  0.37481f,  0.54540f,  -0.02198f,
+  3.43385f,  1.02543f,  -0.40921f, -3.07235f, 0.02996f,  0.00323f,  -0.35414f,
+  0.71099f,  1.39334f,  2.43741f,  -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+  0.00353f,  -1.69637f, 0.45944f,  -0.19884f, 0.03624f,  0.25729f,  0.23659f,
+  -2.08405f, 0.08573f,  -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+  -0.09884f, -0.69026f, -0.37284f, 0.04622f,  1.32973f,  -0.15414f, 0.19138f,
+  -0.67927f, -0.17658f, 0.36008f,  -0.51832f, 0.09887f,  -1.94414f, 2.95227f,
+  1.76937f,  -0.26687f, 8.50976f,  0.26247f,  0.60262f,  -0.27910f, 0.30061f,
+  -0.05117f, 0.16018f,  0.71195f,  0.57871f,  1.57794f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer1[3] = {
+  2.68750f,
+  -1.31894f,
+  -1.36768f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_16_layer0,
+    av1_rect_partition_nn_weights_16_layer1 },
+  { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+                                                           NUM_NODES] = {
+  -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+  -0.09858f, -0.09438f, 0.37306f,  0.23934f,  -1.86375f, -1.18307f, -0.32995f,
+  -0.09745f, 0.05431f,  -0.13799f, 0.14734f,  -0.33219f, 0.18057f,  -0.23792f,
+  -0.28126f, 0.02977f,  -0.07431f, 0.07860f,  0.00067f,  -0.01927f, 1.01841f,
+  -0.57739f, 0.08412f,  -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+  -0.16703f, 0.02808f,  0.11994f,  -0.26267f, 0.19706f,  -0.29707f, -0.25305f,
+  -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f,  -0.37001f, -0.23319f,
+  -0.11139f, -0.30513f, 0.04213f,  -0.12550f, 0.02504f,  0.33245f,  0.01102f,
+  -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+  0.65066f,  0.28443f,  -1.24943f, -3.00246f, -1.01897f, 0.09304f,  0.70052f,
+  -0.12877f, 0.21120f,  -0.37476f, 0.23261f,  -0.28401f, 0.09837f,  0.00020f,
+  -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f,  0.16596f,  -0.06532f,
+  1.72938f,  1.57754f,  0.55963f,  0.33246f,  -0.20023f, 0.30715f,  0.08629f,
+  0.18945f,  -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+  -0.57698f, 0.04157f,  -0.92428f, -1.31268f, 1.78210f,  0.10291f,  1.55042f,
+  -1.26793f, 1.39042f,  -1.43729f, 0.25600f,  5.21263f,  5.31955f,  5.19316f,
+  5.43430f,  0.00294f,  -0.00970f, -0.02333f, 0.00250f,  1.17672f,  6.27544f,
+  4.95973f,  3.54009f,  4.51269f,  0.30750f,  0.78780f,  -0.44741f, -0.76442f,
+  0.75050f,  0.58799f,  0.03400f,  -2.09859f, 1.67313f,  0.12503f,  0.28609f,
+  1.15809f,  2.46530f,  -0.04898f, 0.23072f,  -0.12635f, -0.82097f, -0.63827f,
+  2.16779f,  1.77132f,  0.15434f,  -1.06427f, 0.06206f,  -0.87732f, -0.61897f,
+  -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f,  -0.22992f,
+  1.74638f,  1.29199f,  -0.55464f, 0.98316f,  0.06665f,  0.50254f,  -0.66292f,
+  0.17113f,  -0.32633f, -1.85803f, -0.92759f, 4.44965f,  1.33057f,  0.02135f,
+  -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+  -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f,  0.14248f,
+  -0.48630f, 0.18840f,  0.11040f,  0.17287f,  -0.51880f, 1.12466f,  -0.38888f,
+  -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f,  -0.26881f,
+  -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+  -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f,  1.29826f,  0.23788f,
+  0.04189f,  2.66416f,  0.48815f,  -0.06803f, 0.96742f,  1.27165f,  -0.70348f,
+  -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+  -0.48073f, 2.43657f,  -2.60191f, -1.82837f, 0.50440f,  0.71829f,  0.76491f,
+  0.28293f,  0.20568f,  0.92642f,  -0.02496f, 1.43637f,  -0.24474f, -1.21030f,
+  0.54084f,  1.05130f,  1.29572f,  0.03750f,  -0.36894f, 0.74548f,  -1.33857f,
+  -0.84858f, 1.35230f,  0.80175f,  0.66136f,  1.06473f,  0.18701f,  1.42413f,
+  0.04661f,  -0.07820f, 0.64990f,  -0.43595f, 1.18304f,  -0.11437f, -0.06365f,
+  0.03558f,  0.78260f,  -1.74890f, 1.56217f,  -1.23424f, 4.59193f,  -3.35072f,
+  0.01180f,  -0.18296f, -0.20870f, 0.04510f,  1.52595f,  -1.37402f, -0.33123f,
+  -0.85957f, 0.80598f,  0.03743f,  0.02354f,  0.37707f,  1.62095f,  -0.29627f,
+  -0.31778f, -0.45789f, -0.14906f, 0.25315f,  -0.10817f, -0.32610f, -0.40890f,
+  0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+  -0.17482f, 0.39042f,  0.00000f,  1.69677f,  0.08792f,  -0.09301f, 0.13809f,
+  4.84061f,  0.00000f,  0.40515f,  0.46246f,  0.20644f,  -5.77478f, -1.54510f,
+  0.05660f,  -0.32013f, 0.23649f,  0.03778f,  -2.53710f, -0.27869f, 0.45623f,
+  -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f,  1.93272f,  -1.07032f,
+  -0.27602f, -1.98063f, 0.20816f,  -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+                                                           LABEL_SIZE] = {
+  0.02827f,  1.02560f,  -0.07137f, -0.31911f, 0.11365f,  0.13684f,  -0.07816f,
+  -5.23036f, -0.34340f, 0.84526f,  -1.51845f, 0.07017f,  -8.12570f, 6.24061f,
+  0.35739f,  -0.09937f, -0.30978f, 0.22032f,  0.74968f,  -0.34557f, 0.45547f,
+  -0.16512f, 0.07118f,  1.66415f,  0.41320f,  -1.81533f, -1.96004f, 1.04666f,
+  0.84049f,  4.31009f,  0.68850f,  0.26322f,  -0.24634f, -1.25889f, 0.31952f,
+  0.63632f,  0.05801f,  -0.10664f, -0.21992f, 2.44386f,  0.19526f,  -0.09838f,
+  1.53049f,  -0.26630f, 3.54126f,  -3.40574f, 0.72730f,  0.04557f,  0.92652f,
+  0.15522f,  2.35895f,  -0.13347f, 0.56907f,  0.15352f,  0.01823f,  -0.73939f,
+  0.43104f,  1.90321f,  0.31267f,  -0.51972f, 0.50094f,  -3.98372f, -3.41518f,
+  -0.48183f, 0.26661f,  0.64146f,  0.14500f,  -0.01695f, 0.16653f,  -0.37846f,
+  0.08412f,  2.69714f,  -0.20258f, -0.75786f, 0.11201f,  0.61878f,  4.22231f,
+  -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+  -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f,  2.11441f,
+  -1.08794f, -1.41694f, 0.02620f,  2.18792f,  0.04271f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+  2.47332f,  // one bias per network output (num_outputs is LABEL_SIZE)
+  -1.65756f,
+  -0.81573f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_32_layer0,
+    av1_rect_partition_nn_weights_32_layer1 },
+  { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+                                                           NUM_NODES] = {
+  0.08972f,  4.09095f,  -0.31398f, -2.43631f, -0.74767f, 1.42471f,  1.60926f,
+  1.44721f,  1.88259f,  2.35375f,  1.88299f,  2.01109f,  0.98679f,  2.24131f,
+  0.06279f,  -0.08315f, 0.32107f,  0.91334f,  -0.36569f, 5.55049f,  5.44943f,
+  5.20471f,  5.39099f,  -0.01943f, -0.00284f, 0.02203f,  -0.01309f, 1.41917f,
+  6.68460f,  -6.15986f, 6.41341f,  -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+  0.04308f,  0.95366f,  3.48535f,  2.98266f,  4.11784f,  3.44255f,  0.61630f,
+  0.71405f,  0.63945f,  -0.00713f, 0.39193f,  1.91621f,  3.32755f,  0.71674f,
+  -0.11647f, 2.07090f,  2.64191f,  0.07949f,  -0.05023f, 0.99935f,  0.83145f,
+  0.75898f,  -0.98764f, -0.58731f, 1.21734f,  -0.08076f, -3.26780f, 1.66278f,
+  0.04189f,  -0.33177f, -1.58648f, 1.00883f,  -0.56132f, -2.34877f, 0.67056f,
+  -2.32297f, -0.91641f, -1.02909f, 4.19781f,  3.87484f,  4.32778f,  -1.97171f,
+  -0.24734f, 0.00822f,  0.05892f,  0.12697f,  -3.62915f, -2.93127f, 7.94856f,
+  -3.29311f, 3.26001f,  -0.02231f, 0.02741f,  0.05919f,  0.08190f,  -1.49344f,
+  -0.64475f, -0.24627f, 4.03324f,  -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+  0.08580f,  -5.74721f, 4.42467f,  3.63964f,  3.00258f,  -1.22744f, -0.29408f,
+  0.00767f,  0.12305f,  0.05249f,  -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+  0.04628f,  -0.35249f, -0.18272f, 0.03956f,  -0.19329f, -0.33564f, 0.09856f,
+  -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+  -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f,  -0.24918f, -0.53516f,
+  -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f,  6.54429f,
+  -0.00226f, 0.00430f,  0.00321f,  0.00442f,  0.87551f,  -0.16224f, -0.22832f,
+  -0.60640f, -0.28738f, 0.18062f,  0.22008f,  -0.47406f, 0.80302f,  0.12149f,
+  1.49530f,  1.05069f,  -2.02985f, -0.92833f, 0.25616f,  0.12852f,  3.51840f,
+  0.25226f,  -2.63283f, -4.04386f, 8.46300f,  -2.93408f, 0.44069f,  0.08276f,
+  0.34482f,  -0.22615f, 0.28666f,  3.02962f,  -1.20055f, -1.04832f, -0.97632f,
+  -0.99530f, 1.44196f,  1.68550f,  0.49360f,  1.08155f,  -0.26059f, -0.02876f,
+  -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+  1.17443f,  2.41497f,  1.90537f,  2.37716f,  2.91495f,  -0.44455f, -0.51176f,
+  0.48195f,  0.53032f,  0.23696f,  -1.06211f, 1.47459f,  -0.89029f, 0.29521f,
+  0.66291f,  -0.42653f, 1.82308f,  -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+  -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f,  0.27297f,
+  -4.81472f, 4.60404f,  -0.11053f, 0.14765f,  0.02826f,  -0.14688f, -0.07066f,
+  -0.01224f, 1.20377f,  7.02725f,  -6.02627f, 6.87255f,  -3.14257f, 0.01074f,
+  0.02397f,  -0.02359f, 0.01901f,  0.14956f,  -1.67671f, 2.26714f,  2.57043f,
+  -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+  -2.14814f, -0.67266f, -1.56850f, 0.57137f,  -1.14428f, -0.34265f, -0.12521f,
+  0.01220f,  -0.74906f, -0.19270f, 0.68110f,  -0.24737f, -0.70568f, -1.64826f,
+  -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f,  6.17812f,
+  -0.03191f, -0.00104f, 0.01402f,  -0.00046f, -0.94517f, 1.51266f,  -0.56318f,
+  0.72260f,  -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f,  2.21148f,
+  -1.47954f, -1.01439f, 0.31536f,  0.77238f,  -0.85083f, -0.15758f, -0.50886f,
+  0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+  0.91706f,  -1.31328f, -5.16196f, 1.13191f,  -0.98044f, -1.61122f, 1.03039f,
+  -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f,  -0.10752f, -0.13065f,
+  -0.35567f, -0.35693f, 1.74941f,  1.17379f,  -3.45555f, 5.66321f,  -0.24917f,
+  -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f,  -2.97859f, -0.16774f,
+  0.59835f,  -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+                                                           LABEL_SIZE] = {
+  0.58963f,  4.20320f,  -8.62465f, -6.54014f, 5.41108f,  2.33581f,   -0.10354f,
+  -1.17753f, -3.45909f, -2.24722f, 2.20881f,  3.21971f,  -0.09087f,  -0.21624f,
+  0.16529f,  -8.40985f, -1.60205f, -1.41538f, 4.41826f,  -4.63069f,  -0.27742f,
+  4.08710f,  0.26439f,  -1.46028f, 0.51234f,  6.25212f,  -3.35650f,  -1.21348f,
+  1.37201f,  8.89151f,  0.28859f,  -0.97328f, -0.36196f, -2.71701f,  4.54196f,
+  -0.62476f, -2.43814f, -1.34209f, 0.12850f,  1.73859f,  3.09809f,   -4.42434f,
+  -1.82552f, -3.66420f, -0.31535f, 0.00968f,  -0.02019f, 9.66824f,   0.58835f,
+  1.50425f,  2.84487f,  2.55522f,  0.01409f,  -2.27594f, -0.31800f,  0.91076f,
+  -0.66808f, 0.33120f,  -0.12460f, 0.64457f,  -0.36416f, -10.30843f, 1.51013f,
+  2.06861f,  -0.20989f, -0.87119f, 3.68642f,  7.33662f,  -2.88037f,  -0.52414f,
+  -0.35036f, -0.45947f, -0.07406f, 6.46346f,  -0.16031f, 0.27071f,   0.38845f,
+  -0.21940f, 0.08583f,  -1.39526f, 0.50554f,  0.45279f,  -6.61856f,  1.84069f,
+  -0.19149f, -1.77235f, 0.75136f,  1.11797f,  0.32677f,  -7.10427f,  3.82908f,
+  1.04238f,  -0.91435f, 1.93317f,  -1.84946f, -0.48909f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+  0.32215f,  // one bias per network output (num_outputs is LABEL_SIZE)
+  -0.57522f,
+  0.25314f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_64_layer0,
+    av1_rect_partition_nn_weights_64_layer1 },
+  { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+                                                            NUM_NODES] = {
+  -0.70901f, -3.03481f, 3.30604f,  -1.28803f,  -0.08610f, -0.33320f, -0.30716f,
+  0.25100f,  0.14323f,  -0.98422f, -0.89084f,  -0.24508f, -1.10785f, -0.82524f,
+  0.11766f,  -0.42777f, 1.08965f,  4.35125f,   -1.19388f, 4.22042f,  4.96306f,
+  6.32406f,  3.29899f,  -0.90768f, 0.05203f,   0.38467f,  1.74257f,  -0.19918f,
+  -0.11335f, 0.00140f,  -0.42303f, -0.04419f,  0.03583f,  -0.05441f, -0.19586f,
+  0.01484f,  -1.19964f, 0.25497f,  3.04502f,   0.05446f,  -0.23253f, 0.00266f,
+  0.07117f,  -2.78986f, -4.62953f, 1.45331f,   0.43923f,  0.92298f,  -0.47736f,
+  1.49165f,  0.45942f,  -1.99787f, 3.33510f,   0.17234f,  0.04024f,  -1.42780f,
+  0.23566f,  -0.90970f, 1.18041f,  -1.45865f,  2.30878f,  -1.28507f, 1.87290f,
+  1.91186f,  4.74826f,  -3.70735f, 4.49808f,   -4.72275f, -0.02696f, -0.02642f,
+  -0.06093f, -0.01121f, -0.70683f, 2.69737f,   -1.88563f, 2.48637f,  1.10922f,
+  0.74624f,  0.40308f,  2.06396f,  1.39289f,   0.00909f,  -2.05271f, -1.53539f,
+  -1.38323f, 0.83303f,  -0.32250f, 0.51172f,   3.91249f,  1.66373f,  1.13184f,
+  -2.22874f, -1.13448f, -0.11185f, 0.19387f,   0.36770f,  -0.58933f, 0.22789f,
+  1.17307f,  0.77461f,  0.20817f,  0.33417f,   0.54037f,  0.32961f,  -0.18456f,
+  -9.78171f, -0.17216f, -3.44703f, -2.42158f,  0.51946f,  4.35949f,  -0.73335f,
+  -1.61515f, -0.29622f, -0.37617f, -0.42316f,  0.74922f,  1.44386f,  3.92704f,
+  -3.76274f, 4.19775f,  -3.86958f, 0.00074f,   -0.02418f, -0.12944f, 0.05857f,
+  -0.85507f, 5.42546f,  5.40338f,  5.54347f,   5.59791f,  -0.01611f, 0.01618f,
+  -0.01654f, -0.00270f, -0.39608f, -0.40410f,  -0.24551f, 0.09124f,  -0.34413f,
+  -0.11504f, 0.12793f,  -0.31523f, 0.09148f,   -0.08567f, -0.05140f, -0.13310f,
+  -0.81200f, 0.06882f,  -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+  -1.02573f, 0.32788f,  1.06828f,  -1.25503f,  -0.42693f, 2.01413f,  -2.29103f,
+  0.62271f,  1.11764f,  -1.83113f, -1.32325f,  -1.65651f, -2.87826f, 1.46910f,
+  0.60885f,  0.16079f,  0.00171f,  -0.25658f,  -0.25465f, -0.14149f, 0.19497f,
+  -0.07866f, -0.37080f, -0.05778f, -0.08870f,  -0.20491f, 0.84521f,  -0.18214f,
+  -1.38441f, -1.08932f, -1.76627f, 0.73172f,   0.05967f,  1.28057f,  3.42722f,
+  1.69287f,  0.77169f,  0.44528f,  1.85513f,   0.07840f,  1.31252f,  2.89948f,
+  1.49489f,  0.15281f,  0.54708f,  -1.14185f,  -2.51063f, 0.36618f,  -0.55322f,
+  0.96671f,  1.59470f,  1.38252f,  1.99697f,   0.03266f,  -0.23200f, -0.01127f,
+  -0.18918f, -0.37598f, -0.03119f, -0.36039f,  -0.21192f, -0.11565f, -4.22635f,
+  1.41252f,  0.56608f,  -0.08867f, 3.11924f,   -0.54597f, -0.12504f, -0.05289f,
+  -0.28665f, -0.58297f, -1.18362f, -0.76201f,  -1.22011f, -0.58756f, 0.14740f,
+  1.43971f,  0.98381f,  -0.02998f, -0.40678f,  -0.23047f, -0.12979f, 0.04003f,
+  -0.22081f, -0.09294f, -0.15955f, -0.10379f,  -0.10192f, -1.51316f, 2.39482f,
+  -1.69975f, 3.58976f,  -0.91032f, -0.03498f,  0.48982f,  -0.13418f, 0.76256f,
+  1.61003f,  -2.01676f, -1.24430f, -3.25763f,  1.12314f,  2.00740f,  0.04613f,
+  -0.14746f, -0.57374f, 3.44511f,  -0.56767f,  -4.08432f, -2.04894f, 2.35951f,
+  -0.00458f, 0.18512f,  0.09916f,  -0.04084f,  -1.56207f, 1.38034f,  4.17302f,
+  -1.47326f, -2.03530f, -0.00210f, 0.27469f,   -0.17423f, 0.86860f,  2.76195f,
+  2.43269f,  -3.57331f, 2.08715f,  -1.44171f,  -0.17389f, 2.26157f,  -0.07852f,
+  2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+  2.53427f,  1.66678f,  -0.84914f, -0.15070f, -1.74769f, 0.45218f,  -0.26067f,
+  2.05916f,  0.08978f,  5.30984f,  2.66243f,  -1.62740f, 0.70018f,  1.96403f,
+  -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f,  -0.08373f, 0.00225f,
+  -1.40692f, -0.27569f, -0.30253f, 0.77377f,  -0.67636f, -0.26379f, 1.82348f,
+  0.66120f,  0.61119f,  -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+                                                            LABEL_SIZE] = {
+  1.53453f,  -0.23707f, 7.88368f,  0.33340f,  0.97523f,  1.38538f,  -0.16746f,
+  4.42070f,  3.18678f,  -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+  -8.75673f, 0.27398f,  -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+  3.32134f,  0.25375f,  -0.00394f, 2.30213f,  -0.14183f, 0.14544f,  -1.42830f,
+  1.31101f,  3.99389f,  -0.00017f, -2.90184f, -2.11444f, 2.16734f,  -3.05133f,
+  0.39206f,  4.61489f,  -2.88181f, -0.47745f, 2.86649f,  -1.20621f, 3.70550f,
+  1.58029f,  -4.58731f, -2.29350f, -0.76930f, 5.19135f,  -0.22521f, -5.08782f,
+  2.17316f,  1.30563f,  0.16777f,  -2.17767f, -2.09904f, 1.37001f,  0.25091f,
+  -1.76743f, 1.57940f,  0.30544f,  -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+  -0.88449f, 0.79299f,  -1.35368f, -4.54110f, 0.02244f,  -5.11580f, 1.60883f,
+  0.29352f,  -6.47042f, -1.81426f, 1.24013f,  0.90980f,  7.93977f,  2.12555f,
+  5.24720f,  4.19508f,  0.21499f,  11.06045f, -0.74752f, 0.89396f,  0.26422f,
+  1.72332f,  -1.25113f, -1.71136f, 0.13676f,  -0.07867f, -0.96929f, 0.19911f,
+  3.58233f,  -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+  1.09014f,  // one bias per network output (num_outputs is LABEL_SIZE)
+  -0.53317f,
+  -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_128_layer0,
+    av1_rect_partition_nn_weights_128_layer1 },
+  { av1_rect_partition_nn_bias_128_layer0,
+    av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#endif  // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index 461c3af83..c5508e25c 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -70,7 +70,7 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
       // TODO(any): please enable multi-thread and remove the flag when loop
       // filter mask is compatible with multi-thread.
 #if LOOP_FILTER_BITMASK
-  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane,
                         plane + 1, partial_frame);
 #else
   if (cpi->num_workers > 1)
@@ -193,6 +193,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   (void)sd;
 
   lf->sharpness_level = 0;
+  cpi->td.mb.rdmult = cpi->rd.RDMULT;
 
   if (method == LPF_PICK_MINIMAL_LPF) {
     lf->filter_level[0] = 0;
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
index 2a168358e..357097ae1 100644
--- a/third_party/aom/av1/encoder/picklpf.h
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_PICKLPF_H_
-#define AV1_ENCODER_PICKLPF_H_
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,4 +27,4 @@ void av1_pick_filter_level(const struct yv12_buffer_config *sd,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_PICKLPF_H_
+#endif  // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index 28b693b08..e7804f6b4 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -15,6 +15,7 @@
 #include <math.h>
 
 #include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/binary_codes_writer.h"
@@ -22,7 +23,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
-
 #include "av1/common/onyxc_int.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/restoration.h"
@@ -181,6 +181,77 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
   return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
 }
 
+// Plain-C sum of squared errors between src8 and the self-guided projection
+// of dat8: each pixel is rebuilt from dat8 plus xq-weighted (flt - dat) terms.
+// A filter whose radius params->r[k] is 0 contributes nothing and is skipped.
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+                                     int src_stride, const uint8_t *dat8,
+                                     int dat_stride, int32_t *flt0,
+                                     int flt0_stride, int32_t *flt1,
+                                     int flt1_stride, int xq[2],
+                                     const sgr_params_type *params) {
+  const int shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const uint8_t *src_row = src8;
+  const uint8_t *dat_row = dat8;
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        assert(flt1[x] < (1 << 15) && flt1[x] > -(1 << 15));
+        assert(flt0[x] < (1 << 15) && flt0[x] > -(1 << 15));
+        const int32_t base = (int32_t)(dat_row[x] << SGRPROJ_RST_BITS);
+        int32_t proj = base << SGRPROJ_PRJ_BITS;
+        proj += xq[0] * (flt0[x] - base) + xq[1] * (flt1[x] - base);
+        const int32_t diff = ROUND_POWER_OF_TWO(proj, shift) - src_row[x];
+        err += diff * diff;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    }
+  } else if (params->r[0] > 0) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        assert(flt0[x] < (1 << 15) && flt0[x] > -(1 << 15));
+        const int32_t base = (int32_t)(dat_row[x] << SGRPROJ_RST_BITS);
+        int32_t proj = base << SGRPROJ_PRJ_BITS;
+        proj += xq[0] * (flt0[x] - base);
+        const int32_t diff = ROUND_POWER_OF_TWO(proj, shift) - src_row[x];
+        err += diff * diff;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+      flt0 += flt0_stride;
+    }
+  } else if (params->r[1] > 0) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        assert(flt1[x] < (1 << 15) && flt1[x] > -(1 << 15));
+        const int32_t base = (int32_t)(dat_row[x] << SGRPROJ_RST_BITS);
+        int32_t proj = base << SGRPROJ_PRJ_BITS;
+        proj += xq[1] * (flt1[x] - base);
+        const int32_t diff = ROUND_POWER_OF_TWO(proj, shift) - src_row[x];
+        err += diff * diff;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+      flt1 += flt1_stride;
+    }
+  } else {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int32_t diff = (int32_t)(dat_row[x]) - src_row[x];
+        err += diff * diff;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+    }
+  }
+
+  return err;
+}
+
 static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
                                     int src_stride, const uint8_t *dat8,
                                     int dat_stride, int use_highbitdepth,
@@ -192,21 +263,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
   int xq[2];
   decode_xq(xqd, xq, params);
   if (!use_highbitdepth) {
-    const uint8_t *src = src8;
-    const uint8_t *dat = dat8;
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        const int32_t u =
-            (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-        int32_t v = u << SGRPROJ_PRJ_BITS;
-        if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u);
-        if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u);
-        const int32_t e =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
-            src[i * src_stride + j];
-        err += e * e;
-      }
-    }
+    err = av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, xq, params);
   } else {
     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
@@ -463,9 +522,11 @@ static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
     // Iterate over the stripe in blocks of width pu_width
     for (int j = 0; j < width; j += pu_width) {
       const int w = AOMMIN(pu_width, width - j);
-      av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j,
-                                 flt1_row + j, flt_stride, sgr_params_idx,
-                                 bit_depth, use_highbd);
+      const int ret = av1_selfguided_restoration(
+          dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+          flt_stride, sgr_params_idx, bit_depth, use_highbd);
+      (void)ret;
+      assert(!ret);
     }
   }
 }
@@ -588,22 +649,9 @@ static void search_sgrproj(const RestorationTileLimits *limits,
   if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
 }
 
-static double find_average(const uint8_t *src, int h_start, int h_end,
-                           int v_start, int v_end, int stride) {
-  uint64_t sum = 0;
-  double avg = 0;
-  int i, j;
-  aom_clear_system_state();
-  for (i = v_start; i < v_end; i++)
-    for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
-  avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
-  return avg;
-}
-
-static void compute_stats(int wiener_win, const uint8_t *dgd,
-                          const uint8_t *src, int h_start, int h_end,
-                          int v_start, int v_end, int dgd_stride,
-                          int src_stride, double *M, double *H) {
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+                         int h_start, int h_end, int v_start, int v_end,
+                         int dgd_stride, int src_stride, double *M, double *H) {
   int i, j, k, l;
   double Y[WIENER_WIN2];
   const int wiener_win2 = wiener_win * wiener_win;
@@ -626,8 +674,7 @@ static void compute_stats(int wiener_win, const uint8_t *dgd,
       assert(idx == wiener_win2);
       for (k = 0; k < wiener_win2; ++k) {
         M[k] += Y[k] * X;
-        H[k * wiener_win2 + k] += Y[k] * Y[k];
-        for (l = k + 1; l < wiener_win2; ++l) {
+        for (l = k; l < wiener_win2; ++l) {
           // H is a symmetric matrix, so we only need to fill out the upper
           // triangle here. We can copy it down to the lower triangle outside
           // the (i, j) loops.
@@ -1073,9 +1120,9 @@ static void search_wiener(const RestorationTileLimits *limits,
                          limits->h_start, limits->h_end, limits->v_start,
                          limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
   } else {
-    compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start,
-                  limits->h_end, limits->v_start, limits->v_end,
-                  rsc->dgd_stride, rsc->src_stride, M, H);
+    av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+                      limits->h_start, limits->h_end, limits->v_start,
+                      limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
   }
 
   const MACROBLOCK *const x = rsc->x;
@@ -1266,6 +1313,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
   // problem, as these elements are ignored later, but in order to quiet
   // Valgrind's warnings we initialise the array below.
   memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+  cpi->td.mb.rdmult = cpi->rd.RDMULT;
 
   RestSearchCtxt rsc;
   const int plane_start = AOM_PLANE_Y;
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
index 179b89ff9..3fec0c34b 100644
--- a/third_party/aom/av1/encoder/pickrst.h
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -8,22 +8,39 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_ENCODER_PICKRST_H_
-#define AV1_ENCODER_PICKRST_H_
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #include "av1/encoder/encoder.h"
+#include "aom_ports/system_state.h"
 
 struct yv12_buffer_config;
 struct AV1_COMP;
 
+static const uint8_t g_shuffle_stats_data[16] = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+// Average of src over [v_start, v_end) x [h_start, h_end); 0.0 when empty.
+static INLINE double find_average(const uint8_t *src, int h_start, int h_end,
+                                  int v_start, int v_end, int stride) {
+  uint64_t sum = 0;
+  aom_clear_system_state();
+  if (v_end <= v_start || h_end <= h_start) return 0.0;  // avoid dividing by 0
+  for (int i = v_start; i < v_end; i++)
+    for (int j = h_start; j < h_end; j++) sum += src[i * stride + j];
+  const int64_t count = (int64_t)(v_end - v_start) * (h_end - h_start);
+  return (double)sum / (double)count;
+}
+
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_PICKRST_H_
+#endif  // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
index 42a4c590b..40dd46768 100644
--- a/third_party/aom/av1/encoder/pustats.h
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_PUSTATS_H_
-#define AV1_ENCODER_PUSTATS_H_
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -18,83 +18,78 @@ extern "C" {
 
 #include "av1/encoder/ml.h"
 
-#define NUM_FEATURES 11
+#define NUM_FEATURES_PUSTATS 8
 #define NUM_HIDDEN_LAYERS 2
 #define HIDDEN_LAYERS_0_NODES 12
 #define HIDDEN_LAYERS_1_NODES 10
 #define LOGITS_NODES 1
 
 static const float
-    av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES *
+    av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
                                           HIDDEN_LAYERS_0_NODES] = {
-      21.5067f,  22.6709f,  0.0049f,   0.9288f,  -0.0100f,  0.0060f,   -0.0071f,
-      -0.0085f,  0.0348f,   -0.1273f,  10.1154f, 6.3405f,   7.8589f,   -0.0652f,
-      -4.6352f,  0.0445f,   -3.2748f,  0.1025f,  -0.0385f,  -0.4505f,  1.1320f,
-      3.2634f,   23.2420f,  -7.9056f,  0.0522f,  -18.1555f, 0.0977f,   0.1155f,
-      -0.0138f,  0.0267f,   -0.3992f,  0.2735f,  22.8063f,  35.1043f,  3.8140f,
-      -0.0295f,  0.0771f,   -0.6938f,  0.0302f,  -0.0266f,  0.0989f,   -0.0794f,
-      0.2981f,   33.3333f,  -24.1150f, 1.4986f,  -0.0975f,  -15.3938f, -0.0858f,
-      -0.0845f,  -0.0869f,  -0.0858f,  0.3542f,  0.0155f,   -18.2629f, 9.6688f,
-      -11.9643f, -0.2904f,  -5.3026f,  -0.1011f, -0.1202f,  0.0127f,   -0.0269f,
-      0.3434f,   0.0595f,   16.6800f,  41.4730f, 6.9269f,   -0.0512f,  -1.4540f,
-      0.0468f,   0.0077f,   0.0983f,   0.1265f,  -0.5234f,  0.9477f,   36.6470f,
-      -0.4838f,  -0.2269f,  -0.1143f,  -0.3907f, -0.5005f,  -0.0179f,  -0.1057f,
-      0.1233f,   -0.4412f,  -0.0474f,  0.1140f,  -21.6813f, -0.9077f,  -0.0078f,
-      -3.3306f,  0.0417f,   0.0412f,   0.0427f,  0.0418f,   -0.1699f,  0.0072f,
-      -22.3335f, 16.1203f,  -10.1220f, -0.0019f, 0.0005f,   -0.0054f,  -0.0155f,
-      -0.0302f,  -0.0379f,  0.1276f,   0.1568f,  21.6175f,  12.2919f,  11.0327f,
-      -0.2000f,  -8.6691f,  -0.5593f,  -0.5952f, -0.4203f,  -0.4857f,  -1.1239f,
-      3.1404f,   -13.1098f, -5.9165f,  22.2060f, -0.0312f,  -3.9642f,  -0.0344f,
-      -0.0656f,  -0.0273f,  -0.0465f,  0.1412f,  -6.1974f,  9.3661f,
+      -0.1758f, -0.0499f, -10.0069f, -2.2838f,  -0.3359f,  0.3459f,  -0.3285f,
+      -0.0515f, -0.5417f, 0.2357f,   -0.0575f,  -69.0782f, 0.5348f,  1.4068f,
+      0.2213f,  -1.0490f, -0.0636f,  0.1654f,   1.1002f,   33.4924f, 0.4358f,
+      1.2499f,  0.1143f,  0.0592f,   -1.6335f,  -0.0092f,  1.2207f,  -28.4543f,
+      -0.4973f, 0.4368f,  0.2341f,   -0.1623f,  -3.8986f,  0.1311f,  -1.8789f,
+      -3.9079f, -0.8158f, -0.8420f,  1.4295f,   -2.3629f,  -1.4825f, 0.6498f,
+      -5.3669f, 6.4434f,  1.8393f,   -35.0678f, 3.7459f,   -2.8504f, 2.0502f,
+      -0.1812f, -3.9011f, -1.0155f,  1.8375f,   -1.4517f,  1.3917f,  3.8664f,
+      0.8345f,  -0.3472f, 5.7740f,   -1.1196f,  -0.3264f,  -1.2481f, -0.9284f,
+      -4.9657f, 2.2831f,  0.7337f,   2.3176f,   0.6416f,   0.8804f,  1.9988f,
+      -1.3426f, 1.2728f,  1.2249f,   -0.1551f,  5.6045f,   0.2046f,  -2.1464f,
+      -2.4922f, -0.5334f, 12.1055f,  7.2467f,   -0.0070f,  0.0234f,  0.0021f,
+      0.0215f,  -0.0098f, -0.0682f,  -6.1494f,  -0.3176f,  -1.6069f, -0.2119f,
+      -1.0533f, -0.3566f, 0.5294f,   -0.4335f,  0.1626f,
     };
 
 static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
     {
-      -14.3065f, 2.059f,   -62.9916f, -50.1209f, 57.643f,  -59.3737f,
-      -30.4737f, -0.1112f, 72.5427f,  55.402f,   24.9523f, 18.5834f,
+      10.5266f, 5.3268f, -1.0678f, 7.7411f,  8.7164f,  -0.3235f,
+      7.3028f,  9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
     };
 
 static const float
     av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
                                           HIDDEN_LAYERS_1_NODES] = {
-      0.3883f,  -0.2784f, -0.2850f, 0.4894f,  -2.2450f, 0.4511f,  -0.1969f,
-      -0.0077f, -1.4924f, 0.1138f,  -2.9848f, 1.0211f,  -0.1712f, -0.1952f,
-      -0.4774f, 0.0761f,  -0.3186f, -0.1002f, 0.8663f,  0.5026f,  1.1920f,
-      0.9337f,  0.3911f,  -0.3841f, -0.0037f, 0.7295f,  -0.3183f, 0.1829f,
-      -1.3670f, -0.1046f, 0.6629f,  0.0619f,  -0.1551f, 0.8174f,  2.1521f,
-      -1.3323f, -0.0527f, -0.5772f, 0.2001f,  -0.6270f, -1.0625f, 0.3342f,
-      0.6676f,  0.4605f,  -2.0049f, 0.7781f,  0.0713f,  -0.0824f, -0.4529f,
-      0.1757f,  -0.1338f, -0.2319f, -0.2864f, 0.1248f,  0.3887f,  -0.1676f,
-      1.8422f,  0.6435f,  1.2123f,  -0.5667f, -0.2423f, -0.0314f, 0.2411f,
-      -0.5013f, 0.0422f,  0.2559f,  0.4435f,  -0.1223f, 1.5167f,  0.3939f,
-      1.0898f,  0.0795f,  -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f,
-      -0.4368f, -0.0984f, 0.0837f,  3.6169f,  0.0662f,  -0.1679f, -0.8090f,
-      -0.2610f, -0.5791f, 0.0642f,  -0.2979f, -0.9036f, 0.2898f,  0.3265f,
-      0.4660f,  -1.6358f, -0.0347f, 0.1087f,  0.0353f,  0.5687f,  -0.5242f,
-      -0.4895f, 0.7693f,  -1.3829f, -0.2244f, -0.2880f, 0.0575f,  2.0563f,
-      -0.2322f, -1.1597f, 1.6125f,  -0.0925f, 1.3540f,  0.1432f,  0.3993f,
-      -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f,  -0.5724f, 0.0122f,
-      -1.0829f,
+      10.5932f,  2.5192f,  -0.0015f, 5.9479f,   5.2426f,   -0.4091f, 5.3220f,
+      6.0469f,   0.7200f,  3.3241f,  5.5006f,   12.8290f,  -1.6396f, 0.5743f,
+      -0.8370f,  1.9956f,  -4.9270f, -1.5295f,  2.1350f,   -9.4415f, -0.7094f,
+      5.1822f,   19.7287f, -3.0444f, -0.3320f,  0.0031f,   -0.2709f, -0.5249f,
+      0.3281f,   -0.2240f, 0.2225f,  -0.2386f,  -0.4370f,  -0.2438f, -0.4928f,
+      -0.2842f,  -2.1772f, 9.2570f,  -17.6655f, 3.5448f,   -2.8394f, -1.0167f,
+      -0.5115f,  -1.9260f, -0.2111f, -0.7528f,  -1.2387f,  -0.0401f, 5.0716f,
+      -3.3763f,  -0.2898f, -0.4956f, -7.9993f,  0.1526f,   -0.0242f, 0.7354f,
+      6.0432f,   4.8043f,  7.4790f,  -0.6295f,  1.7565f,   3.7197f,  -2.3963f,
+      6.8945f,   2.9717f,  -3.1623f, 3.4241f,   4.4676f,   -1.8154f, -2.9401f,
+      -8.5657f,  -3.0240f, -1.4661f, 8.1145f,   -12.7858f, 3.3624f,  -1.0819f,
+      -4.2856f,  1.1801f,  -0.5587f, -1.6062f,  -1.1813f,  -3.5882f, -0.2490f,
+      -24.9566f, -0.4140f, -0.1113f, 3.5537f,   4.4112f,   0.1367f,  -1.5876f,
+      1.6605f,   1.3903f,  -0.0253f, -2.1419f,  -2.2197f,  -0.7659f, -0.4249f,
+      -0.0424f,  0.1486f,  0.4643f,  -0.9068f,  -0.3619f,  -0.7624f, -0.9132f,
+      -0.4947f,  -0.3527f, -0.5445f, -0.4768f,  -1.7761f,  -1.0686f, 0.5462f,
+      1.3371f,   4.3116f,  0.0777f,  -2.7216f,  -1.8908f,  3.4989f,  7.7269f,
+      -2.7566f,
     };
 
 static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
     {
-      -10.3717f, 37.304f,  -36.7221f, -52.7572f, 44.0877f,
-      41.1631f,  36.3299f, -48.6087f, -4.5189f,  13.0611f,
+      13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+      6.1715f,  0.5094f,  7.6433f,  -0.3992f, -1.3555f,
     };
 
 static const float
     av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
-      0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f,
-      1.4909f, 1.3554f, -0.8626f, -0.618f,  -0.9458f,
+      4.3078f, -17.3497f, 0.0195f,  34.6032f, -5.0127f,
+      5.3079f, 10.0077f,  -13.129f, 0.0087f,  -8.4009f,
     };
 
 static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
-  30.6878f,
+  4.5103f,
 };
 
 static const NN_CONFIG av1_pustats_rate_nnconfig = {
-  NUM_FEATURES,                                      // num_inputs
+  NUM_FEATURES_PUSTATS,                              // num_inputs
   LOGITS_NODES,                                      // num_outputs
   NUM_HIDDEN_LAYERS,                                 // num_hidden_layers
   { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES },  // num_hidden_nodes
@@ -111,76 +106,71 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = {
 };
 
 static const float
-    av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES *
+    av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
                                           HIDDEN_LAYERS_0_NODES] = {
-      0.7770f,   1.0881f,  0.0177f,  0.4939f,  -0.2541f, -0.2672f, -0.1705f,
-      -0.1940f,  -0.6395f, 1.2928f,  3.6240f,  2.4445f,  1.6790f,  0.0265f,
-      0.1897f,   0.1776f,  0.0422f,  0.0197f,  -0.0466f, 0.0462f,  -1.0827f,
-      2.0231f,   1.8044f,  2.7022f,  0.0064f,  0.2255f,  -0.0552f, -0.1010f,
-      -0.0581f,  -0.0781f, 0.2614f,  -3.4085f, 1.7478f,  0.1155f,  -0.1458f,
-      -0.0031f,  -0.1797f, -0.4378f, -0.0539f, 0.0607f,  -0.1347f, -0.3142f,
-      -0.2014f,  -0.4484f, -0.2808f, 1.5913f,  0.0046f,  -0.0610f, -0.6479f,
-      -0.7278f,  -0.5592f, -0.6695f, -0.8120f, 2.9056f,  -1.1501f, 9.3618f,
-      4.2486f,   0.0011f,  -0.1499f, -0.0834f, 0.1282f,  0.0409f,  0.1670f,
-      -0.1398f,  -0.4661f, 13.7700f, 8.2061f,  -0.0685f, 0.0061f,  -0.2951f,
-      0.0169f,   0.0520f,  0.0040f,  0.0374f,  0.0467f,  -0.0107f, 14.2664f,
-      -2.2489f,  -0.2516f, -0.0061f, -0.9921f, 0.1223f,  0.1212f,  0.1199f,
-      0.1185f,   -0.4867f, 0.0325f,  -5.0757f, -8.7853f, 1.0450f,  0.0169f,
-      0.5462f,   0.0051f,  0.1330f,  0.0143f,  0.1429f,  -0.0258f, 0.2769f,
-      -12.8839f, 22.3093f, 1.2761f,  0.0037f,  -1.2459f, -0.0466f, 0.0003f,
-      -0.0464f,  -0.0067f, 0.2361f,  0.0355f,  23.3833f, 10.9218f, 2.6811f,
-      0.0222f,   -1.1055f, 0.1825f,  0.0575f,  0.0114f,  -0.1259f, 0.3148f,
-      -2.0047f,  11.9559f, 5.7375f,  0.8802f,  0.0042f,  -0.2469f, -0.1040f,
-      -1.5679f,  0.1969f,  -0.0184f, 0.0157f,  0.6688f,  3.4492f,
+      -0.2560f, 0.1105f,  -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+      0.4885f,  1.7518f,  0.4985f,  0.5582f,  -0.3739f, 0.9403f,  0.3874f,
+      0.3265f,  1.7383f,  3.1747f,  0.0285f,  3.3942f,  -0.0123f, 0.5057f,
+      0.1584f,  0.2697f,  4.6151f,  3.6251f,  -0.0121f, -1.0047f, -0.0037f,
+      0.0127f,  0.1935f,  -0.5277f, -2.7144f, 0.0729f,  -0.1457f, -0.0816f,
+      -0.5462f, 0.4738f,  0.3599f,  -0.0564f, 0.0910f,  0.0126f,  -0.0310f,
+      -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f,  -0.2662f, -0.0999f,
+      -0.2983f, -0.4899f, -0.2314f, 0.2873f,  -0.3614f, 0.1783f,  -0.1210f,
+      0.3569f,  0.5436f,  -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+      1.9045f,  0.5463f,  0.1102f,  -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+      -0.3531f, -1.3095f, 0.6099f,  0.7977f,  4.1950f,  -0.0067f, -0.2762f,
+      -0.1574f, -0.2149f, 0.6104f,  -1.7053f, 0.1904f,  4.2402f,  -0.2671f,
+      0.8940f,  0.6820f,  0.2241f,  -0.9459f, 1.4571f,  0.5255f,  2.3352f,
+      -0.0806f, 0.5231f,  0.3928f,  0.4146f,  2.0956f,
     };
 
 static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
     {
-      4.5051f,  -4.5858f, 1.4693f, 0.f,      3.7968f, -3.6292f,
-      -7.3112f, 10.9743f, 8.027f,  -2.2692f, -8.748f, -1.3689f,
+      1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+      0.f,     1.1485f, 2.7085f,  -4.7897f, 1.4093f,  -1.657f,
     };
 
 static const float
     av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
                                           HIDDEN_LAYERS_1_NODES] = {
-      -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f,  -0.0027f, -0.2136f,
-      -1.2094f, 0.0935f,  -0.1403f, -0.1477f, -0.0752f, 0.1519f,  -0.4726f,
-      -0.3521f, 0.4199f,  -0.0168f, -0.2927f, -0.2510f, 0.0706f,  -0.2920f,
-      0.2046f,  -0.0400f, -0.2114f, 0.4240f,  -0.7070f, 0.4964f,  0.4471f,
-      0.3841f,  -0.0918f, -0.6140f, 0.6056f,  -0.1123f, 0.3944f,  -0.0178f,
-      -1.7702f, -0.4434f, 0.0560f,  0.1565f,  -0.0793f, -0.0041f, 0.0052f,
-      -0.1843f, 0.2400f,  -0.0605f, 0.3196f,  -0.0286f, -0.0002f, -0.0595f,
-      -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f,
-      0.1900f,  -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f,
-      0.0212f,  -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f,
-      -0.0226f, 0.0616f,  -0.3453f, 0.0810f,  0.4838f,  -0.3780f, -1.4486f,
-      0.7777f,  -0.0459f, -0.6568f, 0.0589f,  -1.0286f, -0.6001f, 0.0826f,
-      0.4794f,  -0.0586f, -0.1759f, 0.3811f,  -0.1313f, 0.3829f,  -0.0968f,
-      -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f,  0.0470f,
-      -0.2432f, 0.3013f,  -0.0743f, -0.3479f, 0.0749f,  -5.2490f, 0.0209f,
-      -0.1653f, -0.0826f, -0.0535f, 0.3225f,  -0.3786f, -0.0104f, 0.3091f,
-      0.3652f,  0.1757f,  -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f,
-      -0.5539f,
+      -0.5203f, -1.3468f, 0.3865f,  -0.6859f, 0.0058f,  4.0682f,  0.4807f,
+      -0.1380f, 0.6050f,  0.8958f,  0.7748f,  -0.1311f, 1.7317f,  1.1265f,
+      0.0827f,  0.1407f,  -0.3605f, 0.5429f,  0.1880f,  -0.1439f, 0.2837f,
+      1.6477f,  0.0832f,  0.0593f,  -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+      -0.3842f, -2.3637f, 0.2514f,  0.8263f,  -0.1872f, 0.5774f,  -0.3610f,
+      -0.0205f, 1.3977f,  -0.1083f, 0.6923f,  1.3039f,  -0.2870f, 1.0622f,
+      -0.0566f, 0.2697f,  -0.5429f, -0.6193f, 1.7559f,  0.3246f,  1.9159f,
+      0.3744f,  0.0686f,  1.0191f,  -0.4212f, 1.9591f,  -0.0691f, -0.1085f,
+      -1.2034f, 0.0606f,  1.0116f,  0.5565f,  -0.1874f, -0.7898f, 0.4796f,
+      0.2290f,  0.4334f,  -0.5817f, -0.2949f, 0.1367f,  -0.2932f, -1.1265f,
+      0.0133f,  -0.5309f, -3.3191f, 0.0939f,  0.3895f,  -2.5812f, -0.0066f,
+      -3.0063f, -0.2982f, 0.7309f,  -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+      1.9630f,  0.1988f,  0.4194f,  0.8762f,  0.3402f,  0.1051f,  -0.1598f,
+      0.2405f,  0.0392f,  1.1256f,  1.5245f,  0.0950f,  0.2160f,  -0.5023f,
+      0.2584f,  0.2074f,  0.2218f,  0.3966f,  -0.0921f, -0.2435f, -0.4560f,
+      -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f,  -0.3342f, -0.7888f,
+      -0.4488f, -1.7168f, 0.3341f,  0.1146f,  0.5226f,  0.2610f,  -0.4574f,
+      -0.4164f,
     };
 
 static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
     {
-      11.9337f, -0.3681f, -6.1324f,  12.674f,  9.0956f,
-      4.6069f,  -4.4158f, -12.4848f, 10.8473f, 5.7633f,
+      -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+      2.7149f,  -2.5649f, 2.7765f, 2.9617f,  2.7684f,
     };
 
 static const float
     av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
-      0.3245f,  0.2979f,  -0.157f,  -0.1441f, 0.1413f,
-      -0.7496f, -0.1737f, -0.5322f, 0.0748f,  0.2518f,
+      -0.6868f, -0.6715f, 0.449f,  -1.293f, 0.6214f,
+      0.9894f,  -0.4342f, 0.7002f, 1.4363f, 0.6951f,
     };
 
 static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
-  4.6065f,
+  2.3371f,
 };
 
 static const NN_CONFIG av1_pustats_dist_nnconfig = {
-  NUM_FEATURES,                                      // num_inputs
+  NUM_FEATURES_PUSTATS,                              // num_inputs
   LOGITS_NODES,                                      // num_outputs
   NUM_HIDDEN_LAYERS,                                 // num_hidden_layers
   { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES },  // num_hidden_nodes
@@ -196,7 +186,6 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = {
   },
 };
 
-#undef NUM_FEATURES
 #undef NUM_HIDDEN_LAYERS
 #undef HIDDEN_LAYERS_0_NODES
 #undef HIDDEN_LAYERS_1_NODES
@@ -206,4 +195,4 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = {
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_PUSTATS_H_
+#endif  // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
index 9b2dac965..0bca39102 100644
--- a/third_party/aom/av1/encoder/random.h
+++ b/third_party/aom/av1/encoder/random.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RANDOM_H_
-#define AV1_ENCODER_RANDOM_H_
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -26,4 +26,4 @@ static INLINE unsigned int lcg_rand16(unsigned int *state) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RANDOM_H_
+#endif  // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
index 1019055ed..c429f2ce5 100644
--- a/third_party/aom/av1/encoder/ransac.h
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RANSAC_H_
-#define AV1_ENCODER_RANSAC_H_
+#ifndef AOM_AV1_ENCODER_RANSAC_H_
+#define AOM_AV1_ENCODER_RANSAC_H_
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -32,4 +32,4 @@ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
 int ransac_translation(int *matched_points, int npoints,
                        int *num_inliers_by_motion, double *params_by_motion,
                        int num_motions);
-#endif  // AV1_ENCODER_RANSAC_H_
+#endif  // AOM_AV1_ENCODER_RANSAC_H_
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
index 14d23f10f..7cd0962c5 100644
--- a/third_party/aom/av1/encoder/rate_distortion_model_params.h
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
-#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -588,4 +588,4 @@ static const NN_CONFIG av1_rdcost_model_nnconfig = {
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#endif  // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index 3aae0144e..2597fb990 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -117,7 +117,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
   for (i = 0; i < QINDEX_RANGE; i++) {
     const double maxq = av1_convert_qindex_to_q(i, bit_depth);
     kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
-    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
     arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
     arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
     inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
@@ -253,6 +253,9 @@ int av1_rc_get_default_min_gf_interval(int width, int height,
 int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
   int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
   interval += (interval & 0x01);  // Round to even value
+#if CONFIG_FIX_GF_LENGTH
+  interval = AOMMAX(FIXED_GF_LENGTH, interval);
+#endif
   return AOMMAX(interval, min_gf_interval);
 }
 
@@ -299,9 +302,9 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
 
   for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
-    rc->rate_correction_factors[i] = 1.0;
+    rc->rate_correction_factors[i] = 0.7;
   }
-
+  rc->rate_correction_factors[KF_STD] = 1.0;
   rc->min_gf_interval = oxcf->min_gf_interval;
   rc->max_gf_interval = oxcf->max_gf_interval;
   if (rc->min_gf_interval == 0)
@@ -556,6 +559,14 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
 
+#if REDUCE_LAST_ALT_BOOST
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+  int *arfgf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+  return arfgf_high_motion_minq[q];
+}
+#endif
+
 static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const unsigned int curr_frame = cpi->common.current_video_frame;
@@ -918,7 +929,7 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
 #define STATIC_MOTION_THRESH 95
 static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
                                          int height, int *bottom_index,
-                                         int *top_index) {
+                                         int *top_index, int *arf_q) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
@@ -959,7 +970,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
         qindex = rc->last_boosted_qindex;
         last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
         delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 0.75, bit_depth);
+                                          last_boosted_q * 0.5, bit_depth);
         active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
       }
     } else {
@@ -1000,17 +1011,49 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
     // For constrained quality dont allow Q less than the cq level
     if (oxcf->rc_mode == AOM_CQ) {
       if (q < cq_level) q = cq_level;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+      if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+          (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
+#endif  // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+        active_best_quality = get_gf_active_quality(rc, q, bit_depth);
 
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-
-      // Constrained quality use slightly lower active best.
-      active_best_quality = active_best_quality * 15 / 16;
+        // Constrained quality use slightly lower active best.
+        active_best_quality = active_best_quality * 15 / 16;
+#if REDUCE_LAST_ALT_BOOST
+        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+          const int boost = min_boost - active_best_quality;
 
+          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+        }
+#endif
+        *arf_q = active_best_quality;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+      } else {
+        active_best_quality = rc->arf_q;
+        int this_height = gf_group->pyramid_level[gf_group->index];
+        while (this_height < gf_group->pyramid_height) {
+          active_best_quality = (active_best_quality + cq_level + 1) / 2;
+          ++this_height;
+        }
+      }
+#endif  // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
     } else if (oxcf->rc_mode == AOM_Q) {
       if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+          active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+          *arf_q = active_best_quality;
+#if REDUCE_LAST_ALT_BOOST
+          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+          const int boost = min_boost - active_best_quality;
+
+          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+        } else {
+          active_best_quality = rc->arf_q;
+        }
 #if USE_SYMM_MULTI_LAYER
         if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
           int this_height = gf_group->pyramid_level[gf_group->index];
@@ -1030,6 +1073,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       }
     } else {
       active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if REDUCE_LAST_ALT_BOOST
+      const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+      const int boost = min_boost - active_best_quality;
+
+      active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
 #if USE_SYMM_MULTI_LAYER
       if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
         int this_height = gf_group->pyramid_level[gf_group->index];
@@ -1104,7 +1153,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
     if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
       q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
     } else {
-      q = rc->last_boosted_qindex;
+      q = AOMMIN(rc->last_boosted_qindex,
+                 (active_best_quality + active_worst_quality) / 2);
     }
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
@@ -1129,7 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   return q;
 }
 
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height,
                              int *bottom_index, int *top_index) {
   int q;
   if (cpi->oxcf.pass == 0) {
@@ -1140,8 +1190,17 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
       q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
                                             top_index);
   } else {
+    assert(cpi->oxcf.pass == 2 && "invalid encode pass");
+
+    GF_GROUP *gf_group = &cpi->twopass.gf_group;
+    int arf_q = 0;
+
     q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
-                                      top_index);
+                                      top_index, &arf_q);
+
+    if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+      cpi->rc.arf_q = arf_q;
+    }
   }
 
   return q;
@@ -1327,13 +1386,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
     update_golden_frame_stats(cpi);
 
   if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
-
-  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
-  //               differently here for rc->avg_frame_bandwidth.
-  if (cm->show_frame || rc->is_bwd_ref_frame) {
-    rc->frames_since_key++;
-    rc->frames_to_key--;
-  }
   // if (cm->current_video_frame == 1 && cm->show_frame)
   /*
   rc->this_frame_target =
@@ -1635,10 +1687,6 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
     if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
       rc->max_gf_interval = rc->static_scene_max_gf_interval;
 
-#if FIX_GF_INTERVAL_LENGTH
-    rc->max_gf_interval = FIXED_GF_LENGTH + 1;
-#endif
-
     // Clamp min to max
     rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
   }
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index f0508da9e..198ecab97 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RATECTRL_H_
-#define AV1_ENCODER_RATECTRL_H_
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
 
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
@@ -25,13 +25,27 @@ extern "C" {
 #define BPER_MB_NORMBITS 9
 
 #define CUSTOMIZED_GF 1
-#define FIX_GF_INTERVAL_LENGTH 0
 
-#if FIX_GF_INTERVAL_LENGTH
+#if CONFIG_FIX_GF_LENGTH
 #define FIXED_GF_LENGTH 16
+#define MAX_PYRAMID_LVL 4
+// We allow a frame to have at most two left/right descendants before changing
+// them into a subtree, i.e., we allow the following structure:
+/*                    OUT_OF_ORDER_FRAME
+                     / /              \ \
+(two left children) F F                F F (two right children) */
+// Therefore the max gf size supported by 4 layer structure is
+// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both sides of their parent)
+#define MAX_PYRAMID_SIZE 24
 #define USE_SYMM_MULTI_LAYER 1
+#define REDUCE_LAST_ALT_BOOST 1
+#define REDUCE_LAST_GF_LENGTH 1
+#define MULTI_LVL_BOOST_VBR_CQ 1
 #else
 #define USE_SYMM_MULTI_LAYER 0
+#define REDUCE_LAST_ALT_BOOST 0
+#define REDUCE_LAST_GF_LENGTH 0
+#define MULTI_LVL_BOOST_VBR_CQ 0
 #endif
 
 #if USE_SYMM_MULTI_LAYER
@@ -159,6 +173,9 @@ typedef struct {
 
   // Auto frame-scaling variables.
   int rf_level_maxq[RATE_FACTOR_LEVELS];
+  float_t arf_boost_factor;
+  // Q index used for ALT frame
+  int arf_q;
 } RATE_CONTROL;
 
 struct AV1_COMP;
@@ -228,7 +245,7 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
                                       int *frame_over_shoot_limit);
 
 // Picks q and q bounds given the target for bits
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
                              int *bottom_index, int *top_index);
 
 // Estimates q to achieve a target bits per frame
@@ -275,4 +292,4 @@ int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RATECTRL_H_
+#endif  // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c
deleted file mode 100644
index e69de29bb..000000000
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h
deleted file mode 100644
index e69de29bb..000000000
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index c4d4777bf..b87d89e50 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -648,6 +648,473 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
   }
 }
 
+static double interp_cubic(const double *p, double x) {  // Cubic (Catmull-Rom form) through p[0..3]; x is the offset from p[1] toward p[2].
+  return p[1] + 0.5 * x *  // x = 0 yields p[1]; x = 1 yields p[2].
+                    (p[2] - p[0] +  // linear coefficient
+                     x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +  // quadratic coefficient
+                          x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));  // cubic coefficient
+}
+
+static double interp_bicubic(const double *p, int p_stride, double x,  // Bicubic interpolation over a 4x4 patch whose first element is p[0].
+                             double y) {  // Consecutive patch rows are p_stride elements apart.
+  double q[4];  // One x-interpolated value per patch row.
+  q[0] = interp_cubic(p, x);
+  q[1] = interp_cubic(p + p_stride, x);
+  q[2] = interp_cubic(p + 2 * p_stride, x);
+  q[3] = interp_cubic(p + 3 * p_stride, x);
+  return interp_cubic(q, y);  // Combine the four row results along y.
+}
+
+static const double interp_rgrid_surf[65 * 18] = {
+  0.104019,    0.245714,    0.293686,    0.358635,    0.382167,    0.412446,
+  0.419955,    0.421388,    0.426672,    0.427990,    0.428531,    0.456868,
+  0.569880,    0.638822,    1.016319,    2.143453,    3.565229,    4.720880,
+  0.124618,    0.294211,    0.352023,    0.429991,    0.458206,    0.494510,
+  0.503513,    0.505232,    0.511566,    0.513234,    0.519365,    0.570225,
+  0.697373,    0.840624,    1.462198,    3.289054,    6.256517,    6.852788,
+  0.118630,    0.269669,    0.346620,    0.430999,    0.459385,    0.495783,
+  0.504808,    0.506532,    0.512884,    0.514988,    0.543437,    0.662772,
+  0.795876,    1.313596,    2.403841,    4.163098,    7.440589,    8.616275,
+  0.093329,    0.168205,    0.321320,    0.430607,    0.459385,    0.495783,
+  0.504813,    0.506548,    0.512975,    0.520662,    0.571659,    0.701841,
+  1.010727,    2.138851,    3.460626,    6.317955,    10.098127,   14.418553,
+  0.087021,    0.142905,    0.315011,    0.430509,    0.459385,    0.495787,
+  0.505075,    0.507599,    0.513584,    0.543182,    0.669941,    0.825620,
+  1.362800,    2.572187,    4.205047,    7.498399,    12.303118,   16.641735,
+  0.086923,    0.142513,    0.314913,    0.430508,    0.459385,    0.495803,
+  0.506126,    0.511816,    0.514810,    0.549705,    0.725350,    1.127334,
+  2.168597,    3.463686,    6.318605,    10.162284,   18.556041,   19.847042,
+  0.086923,    0.142513,    0.314913,    0.430506,    0.459376,    0.495805,
+  0.506388,    0.512954,    0.520772,    0.580215,    0.810474,    1.391548,
+  2.579442,    4.205160,    7.498399,    12.381597,   21.703618,   24.015457,
+  0.086923,    0.142513,    0.314911,    0.430353,    0.458765,    0.495652,
+  0.506391,    0.513406,    0.544098,    0.702950,    1.121860,    2.168961,
+  3.463798,    6.318607,    10.162284,   18.685361,   28.188192,   37.638872,
+  0.086923,    0.142513,    0.314901,    0.429742,    0.456313,    0.495045,
+  0.506484,    0.519195,    0.580104,    0.810126,    1.391462,    2.579441,
+  4.205160,    7.498399,    12.381597,   21.848607,   33.367199,   42.623190,
+  0.086923,    0.142513,    0.314899,    0.429589,    0.455706,    0.495155,
+  0.507882,    0.542426,    0.702360,    1.119921,    2.168478,    3.463791,
+  6.318607,    10.162284,   18.685361,   28.345760,   47.802028,   49.163533,
+  0.086924,    0.142548,    0.315086,    0.429842,    0.455870,    0.496336,
+  0.512412,    0.556953,    0.773373,    1.266396,    2.548277,    4.204676,
+  7.498399,    12.381597,   21.848607,   33.548250,   54.301011,   56.262859,
+  0.087067,    0.144957,    0.327436,    0.446616,    0.466362,    0.505706,
+  0.522077,    0.610747,    0.972543,    1.666916,    3.338812,    6.316669,
+  10.162284,   18.685361,   28.345760,   48.065311,   66.145302,   78.396020,
+  0.094295,    0.164235,    0.393722,    0.534219,    0.530922,    0.579308,
+  0.603889,    0.760870,    1.229961,    2.423214,    4.173513,    7.497916,
+  12.381597,   21.848607,   33.548250,   54.589585,   74.875848,   86.468182,
+  0.124096,    0.213005,    0.497188,    0.665176,    0.685973,    0.800200,
+  0.911394,    1.077971,    1.677290,    3.332129,    6.314960,    10.162257,
+  18.685361,   28.345760,   48.065311,   66.453506,   98.275189,   96.862588,
+  0.140999,    0.270140,    0.658212,    0.867661,    0.970183,    1.149516,
+  1.480599,    1.664833,    2.421893,    3.857981,    7.418830,    12.380371,
+  21.848607,   33.548250,   54.589585,   75.188867,   106.657971,  99.762997,
+  0.178353,    0.398001,    0.988462,    1.241473,    1.340967,    1.713568,
+  2.335030,    2.701432,    3.348532,    5.077158,    9.829903,    18.676528,
+  28.345700,   48.065311,   66.453506,   98.588283,   117.057193,  101.130722,
+  0.281079,    0.548300,    1.395825,    1.780770,    2.000508,    2.702964,
+  3.638454,    4.573843,    5.051641,    7.079129,    11.293332,   21.594861,
+  33.544335,   54.589585,   75.188867,   106.971065,  119.957601,  101.466632,
+  0.476762,    0.842189,    2.019678,    2.723895,    3.188467,    4.011610,
+  5.545111,    7.508984,    8.176339,    9.774504,    14.720782,   27.334416,
+  48.049609,   66.453506,   98.588283,   117.370357,  121.329855,  101.509242,
+  0.993999,    1.520111,    3.013605,    4.203530,    4.982992,    6.074944,
+  8.583581,    11.818375,   14.192544,   14.937517,   21.258160,   33.305953,
+  54.585735,   75.188867,   106.971135,  120.279824,  121.976055,  102.690130,
+  1.776487,    2.613655,    4.356487,    6.161726,    7.622196,    9.464193,
+  13.077233,   18.051656,   23.221051,   24.080068,   30.085038,   48.345269,
+  66.457698,   98.588353,   117.379415,  121.976128,  124.356210,  107.713202,
+  3.191085,    4.495201,    5.686033,    8.365566,    11.275339,   14.706437,
+  20.300969,   28.152237,   35.688355,   39.341382,   41.030743,   55.752262,
+  75.211764,   106.980285,  120.608403,  124.680746,  130.222528,  112.260098,
+  6.136611,    7.305215,    7.272532,    10.646713,   15.630815,   22.383168,
+  31.349131,   42.419822,   52.301680,   58.983454,   58.915405,   69.161305,
+  98.992460,   117.713855,  124.344836,  130.623638,  138.442401,  127.846670,
+  11.707980,   13.490761,   11.640845,   14.176132,   22.131124,   33.776462,
+  47.365711,   61.603834,   75.281056,   83.463985,   85.510533,   86.026513,
+  108.787480,  123.031136,  130.607284,  138.954406,  160.867784,  158.958882,
+  27.062874,   32.195139,   24.147297,   22.114632,   35.580506,   52.551674,
+  71.652956,   88.606776,   102.107193,  110.703186,  114.398733,  111.118539,
+  121.503578,  132.455924,  139.490806,  161.412674,  193.563210,  172.203945,
+  35.625692,   47.953028,   42.639820,   42.276254,   58.815664,   84.977282,
+  110.656412,  126.168446,  134.658126,  140.604482,  144.006012,  141.702382,
+  140.125323,  153.122630,  164.748041,  194.156197,  206.854650,  174.013079,
+  49.516447,   65.335381,   71.738306,   81.872819,   98.400740,   136.840488,
+  163.775802,  169.440078,  172.747876,  171.222919,  171.679604,  172.173550,
+  168.200129,  187.617133,  199.683394,  207.768200,  210.062520,  175.478356,
+  60.341673,   92.487135,   119.907299,  136.068010,  144.778950,  189.443534,
+  220.120077,  219.641635,  214.616503,  205.894657,  198.453924,  200.013069,
+  195.938103,  206.118661,  210.447375,  212.061379,  216.078218,  181.162805,
+  78.422159,   112.242899,  158.416312,  181.404320,  193.188690,  229.296967,
+  270.461799,  275.168977,  256.511701,  244.706786,  231.344608,  226.065087,
+  222.248618,  218.662324,  217.966722,  218.248574,  218.818588,  182.740573,
+  88.713664,   123.594164,  172.928179,  213.781414,  245.800351,  252.063414,
+  313.283141,  331.703831,  305.866639,  285.177142,  269.759635,  251.988739,
+  245.998388,  232.688076,  230.588702,  230.882657,  230.319053,  192.120741,
+  102.540561,  152.905927,  189.137131,  241.806756,  273.868497,  284.258017,
+  339.689853,  373.561104,  362.657463,  326.291984,  311.922687,  290.460189,
+  276.774381,  273.012072,  277.751792,  279.123748,  278.820447,  233.813798,
+  132.983118,  176.307242,  197.415684,  243.307787,  280.893995,  332.922370,
+  340.329043,  404.530166,  419.475405,  375.775209,  351.300889,  340.042759,
+  315.683832,  306.123530,  306.359319,  306.733063,  307.609556,  261.647847,
+  149.579109,  185.925581,  207.937033,  245.159084,  301.890957,  350.040480,
+  352.250771,  418.742329,  458.112686,  430.125208,  386.460441,  380.346839,
+  354.679150,  337.305620,  334.504124,  335.889932,  341.060725,  286.898578,
+  153.576812,  202.105624,  219.366967,  248.524506,  314.255692,  350.607526,
+  390.567688,  408.629209,  488.000213,  480.563823,  432.461799,  410.412624,
+  398.607371,  400.188740,  402.780916,  408.853470,  430.449735,  363.777088,
+  161.353129,  214.848904,  231.549852,  258.536466,  313.163177,  368.140577,
+  412.136393,  413.409032,  499.838438,  519.571063,  485.833867,  444.562715,
+  435.738129,  442.358549,  450.166531,  453.208524,  458.424358,  385.823139,
+  175.109034,  227.608058,  250.069563,  286.101747,  312.256740,  378.421485,
+  413.344147,  435.058646,  476.960941,  542.448886,  530.189154,  495.408402,
+  475.326752,  465.017144,  464.694045,  465.144689,  466.905382,  398.669138,
+  184.750180,  240.766694,  283.240772,  305.480150,  322.409001,  374.526162,
+  427.141326,  452.840323,  472.604139,  545.366105,  567.676694,  541.666203,
+  509.591873,  492.044219,  492.778569,  493.765684,  493.235693,  413.684325,
+  194.728357,  254.928927,  289.991157,  300.193195,  324.194589,  371.563147,
+  439.226438,  468.295088,  495.654854,  533.506353,  587.476353,  578.298989,
+  548.041942,  527.393885,  538.965146,  545.070442,  544.295454,  454.012211,
+  205.195287,  283.135677,  297.921431,  319.295927,  355.621830,  392.466463,
+  446.696167,  485.053519,  516.426615,  532.264584,  588.481600,  615.906737,
+  589.319634,  555.754316,  558.389367,  569.094521,  569.779764,  475.384946,
+  218.552054,  298.511016,  319.188338,  351.781666,  372.789510,  412.827434,
+  464.569387,  506.270203,  533.049810,  553.347364,  580.644599,  632.759854,
+  622.235843,  569.960552,  580.799340,  586.553714,  579.488366,  491.826482,
+  244.803348,  299.790203,  324.187975,  363.280782,  403.710443,  441.724083,
+  492.732682,  534.722691,  552.193622,  575.112647,  586.097705,  635.224970,
+  644.642944,  606.017786,  640.321218,  642.316989,  616.397020,  548.300111,
+  256.957358,  318.638991,  355.063346,  389.889307,  433.607315,  468.209001,
+  515.178157,  573.556591,  578.113115,  587.246475,  601.762801,  638.454644,
+  656.574853,  641.184609,  676.908189,  684.198162,  678.387412,  574.805864,
+  251.211502,  323.448532,  364.227424,  411.792704,  462.226488,  503.572288,
+  549.299249,  599.124071,  601.227977,  597.118176,  613.247552,  633.278532,
+  658.074755,  664.930719,  685.731531,  693.632845,  693.076350,  578.326477,
+  267.695377,  354.273736,  389.976833,  438.518178,  493.332686,  544.343027,
+  588.895829,  620.206193,  628.327410,  606.067827,  620.998532,  657.985256,
+  683.936059,  691.345257,  693.894723,  695.175306,  693.618786,  578.517148,
+  274.290725,  363.465288,  411.808596,  463.369805,  515.310226,  581.009306,
+  613.070738,  636.638714,  647.333929,  629.867603,  644.646319,  687.796202,
+  702.859596,  713.495479,  704.068069,  704.991807,  704.188594,  587.283658,
+  302.538449,  389.174737,  438.518422,  493.398902,  547.662399,  601.981814,
+  624.773046,  641.629484,  644.699451,  645.848784,  668.033340,  703.643523,
+  707.422408,  717.329600,  726.298973,  744.127507,  745.365167,  617.954068,
+  310.328188,  410.984766,  463.369805,  515.315010,  581.309832,  613.787792,
+  634.988538,  654.145284,  662.632978,  668.413496,  706.494057,  750.545471,
+  730.724808,  730.002100,  743.625262,  750.801609,  745.308457,  606.505800,
+  329.948756,  437.600191,  493.398902,  547.661910,  601.917884,  622.557745,
+  633.244395,  644.055898,  648.224221,  665.062911,  763.555733,  812.391078,
+  769.063582,  744.865168,  727.579796,  724.950408,  722.179707,  598.564510,
+  350.848328,  462.437458,  515.315010,  581.309823,  613.779123,  634.465309,
+  652.056257,  662.179143,  671.466297,  726.881256,  819.824030,  880.232789,
+  810.371672,  754.246481,  725.053473,  724.253390,  723.503395,  603.394909,
+  373.704088,  492.408266,  547.661910,  601.917884,  622.557620,  633.236320,
+  644.023513,  648.232514,  666.381639,  785.498283,  929.441612,  999.772800,
+  890.339033,  775.852504,  731.840181,  726.905100,  725.251844,  604.899901,
+  394.473422,  514.261306,  581.309823,  613.779123,  634.465309,  652.056257,
+  662.179143,  671.466557,  727.134512,  835.764144,  981.747089,  1018.462934,
+  939.686967,  811.276731,  739.398459,  727.365647,  725.285425,  604.923525,
+  419.976505,  546.538939,  601.917884,  622.557620,  633.236320,  644.023513,
+  648.232514,  666.381639,  785.545191,  932.841398,  1036.609617, 1026.945092,
+  963.822765,  840.827315,  755.532423,  730.241865,  725.366847,  604.924155,
+  437.281359,  580.116337,  613.779123,  634.465309,  652.056257,  662.179143,
+  671.466557,  727.134512,  835.764859,  981.996194,  1031.896881, 1002.544732,
+  881.157178,  828.151494,  799.340975,  751.314325,  728.316587,  605.005504,
+  464.713920,  600.649281,  622.557620,  633.236320,  644.023513,  648.232514,
+  666.381639,  785.545191,  932.841398,  1036.735329, 1035.037004, 995.478339,
+  858.093733,  823.471976,  819.881754,  798.749289,  749.440463,  607.955244,
+  495.880237,  612.473139,  634.465309,  652.056257,  662.179143,  671.466557,
+  727.134512,  835.764859,  981.996194,  1032.339788, 1031.105117, 995.303259,
+  857.733663,  823.435877,  822.822791,  819.873050,  796.882480,  629.038445,
+  510.391280,  621.158273,  633.236320,  644.023513,  648.232514,  666.381639,
+  785.545191,  932.841398,  1036.735329, 1035.566013, 1029.599350, 994.926093,
+  857.645648,  823.435143,  822.904139,  822.822791,  817.965681,  673.856962,
+  514.588176,  632.947715,  652.056257,  662.179143,  671.466557,  727.134512,
+  835.764859,  981.996194,  1032.339788, 1031.547475, 1023.835377, 972.158629,
+  851.968626,  823.347128,  822.904770,  822.904139,  820.752301,  684.418900,
+  520.013294,  631.668183,  644.023513,  648.232514,  666.381639,  785.545191,
+  932.841398,  1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
+  829.201546,  822.994150,  822.904770,  822.904770,  820.792975,  684.582020,
+  531.253628,  650.479606,  662.179143,  671.466557,  727.134512,  835.764859,
+  981.996194,  1032.339788, 1031.636855, 1029.601779, 995.366703,  858.086641,
+  823.524524,  822.906135,  822.904770,  822.904770,  820.792975,  684.582020,
+  528.531744,  642.424501,  648.232514,  666.381639,  785.545191,  932.841398,
+  1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687,  857.733663,
+  823.436508,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
+  545.401164,  660.550678,  671.508859,  727.304161,  835.807162,  981.996850,
+  1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709,  857.645648,
+  823.435143,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
+  537.684760,  646.650947,  669.110131,  796.487512,  935.569890,  1036.777631,
+  1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629,  851.968626,
+  823.347128,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
+  552.408370,  670.001885,  738.246482,  879.690154,  992.939171,  1032.509436,
+  1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721,  829.201546,
+  822.994150,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
+  539.835902,  667.496388,  799.216004,  946.512211,  1039.506123, 1035.609680,
+  1030.219103, 1030.107964, 1029.577207, 995.366703,  858.086641,  823.524524,
+  822.906135,  822.904770,  822.904770,  822.904770,  820.792975,  684.582020,
+  558.362529,  734.277451,  877.197218,  990.478243,  1029.908393, 1028.993978,
+  1027.488620, 1027.464048, 1026.933674, 992.724534,  855.532488,  821.323349,
+  820.792975,  820.792975,  820.792975,  820.792975,  818.686600,  682.825198,
+  453.127195,  649.075095,  780.278390,  867.165890,  862.469711,  857.067460,
+  856.956321,  856.955937,  856.513579,  827.981461,  713.556496,  685.024378,
+  684.582020,  684.582020,  684.582020,  684.582020,  682.825198,  569.510056,
+};
+
+static const double interp_dgrid_surf[65 * 18] = {
+  10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
+  12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
+  12.092165, 11.602421, 11.141559, 8.864495,  12.770003, 14.634889, 14.437149,
+  14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
+  14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
+  10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
+  14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
+  14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
+  14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
+  14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
+  12.176082, 9.228999,  12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
+  14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
+  14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683,  12.980449,
+  15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
+  14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
+  12.201859, 10.891931, 8.482221,  12.980449, 15.384750, 14.651886, 14.238801,
+  14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
+  14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
+  12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
+  14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
+  12.201859, 10.911285, 9.730570,  6.696921,  12.980449, 15.384750, 14.652393,
+  14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
+  14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
+  6.215460,  12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
+  14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
+  12.201859, 10.911285, 9.747361,  7.779960,  5.617541,  12.980448, 15.384731,
+  14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
+  14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
+  7.210003,  5.164575,  12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
+  14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
+  12.201859, 10.911285, 9.747361,  7.790897,  6.322998,  3.931551,  12.981550,
+  15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
+  14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
+  7.219566,  5.781392,  3.486081,  12.991899, 15.376201, 14.579444, 14.296898,
+  14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
+  12.201867, 10.911285, 9.747361,  7.790897,  6.331506,  4.480348,  2.923138,
+  13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
+  14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
+  7.219566,  5.789642,  4.018194,  2.766222,  13.028558, 15.315782, 14.439141,
+  14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
+  12.274375, 10.912967, 9.747371,  7.790897,  6.331506,  4.488594,  3.454993,
+  2.692682,  12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
+  13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
+  7.219566,  5.789642,  4.026440,  3.298077,  2.674624,  12.945493, 15.276596,
+  14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
+  12.288592, 11.511693, 9.900227,  7.793270,  6.331506,  4.488594,  3.463236,
+  3.224318,  2.672433,  12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
+  13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
+  7.220151,  5.789642,  4.026437,  3.305882,  3.191260,  2.615317,  12.581293,
+  14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
+  10.936895, 10.619912, 9.634779,  7.763570,  6.331082,  4.488590,  3.462798,
+  3.216460,  3.076315,  2.373499,  12.283499, 14.455760, 13.890593, 13.427587,
+  13.183783, 12.763833, 11.861006, 10.740618, 9.820756,  9.354945,  8.669862,
+  7.123268,  5.787860,  4.025994,  3.290000,  3.084410,  2.810905,  2.222916,
+  12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
+  9.631040,  8.594396,  8.003736,  7.561587,  6.274418,  4.466637,  3.446574,
+  3.102467,  2.816989,  2.598688,  1.951541,  11.581477, 13.831132, 13.632027,
+  13.380414, 12.807880, 11.665651, 10.218236, 8.562237,  7.222614,  6.611808,
+  6.261676,  5.402793,  3.938544,  3.174375,  2.818166,  2.602758,  2.213911,
+  1.434763,  11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
+  9.109699,  7.421701,  5.965603,  5.272129,  4.991435,  4.423000,  3.369988,
+  2.800371,  2.593901,  2.217431,  1.670917,  1.215265,  10.641194, 11.766277,
+  10.777082, 10.972917, 10.689298, 9.701545,  7.719947,  6.145654,  4.872442,
+  4.099600,  3.880934,  3.514159,  2.786474,  2.368963,  2.162376,  1.673670,
+  1.450770,  1.185424,  10.071964, 11.107701, 9.172361,  8.551313,  8.412080,
+  7.641397,  6.174246,  4.853916,  3.904549,  3.246810,  2.959903,  2.785066,
+  2.240001,  1.793166,  1.585520,  1.449824,  1.405368,  1.168856,  9.213182,
+  9.173278,  7.219231,  6.242951,  5.626013,  5.768007,  4.908666,  3.809589,
+  3.115109,  2.617899,  2.274793,  2.172960,  1.838597,  1.505915,  1.414333,
+  1.392666,  1.338173,  1.105611,  7.365015,  7.471370,  5.622346,  4.520127,
+  3.936272,  4.208822,  3.623024,  2.977794,  2.450003,  2.097261,  1.824090,
+  1.643270,  1.473525,  1.351388,  1.327504,  1.323865,  1.307894,  1.088234,
+  6.198210,  6.580712,  4.682511,  3.416952,  2.941929,  2.766637,  2.650686,
+  2.315439,  1.925838,  1.659784,  1.464419,  1.252806,  1.162722,  1.197518,
+  1.199875,  1.197365,  1.194040,  0.995797,  5.402507,  5.055466,  3.728724,
+  2.624359,  2.165810,  1.943189,  1.918190,  1.738078,  1.516328,  1.290520,
+  1.155793,  1.015962,  0.881900,  0.807203,  0.754242,  0.743378,  0.740288,
+  0.614158,  3.937867,  3.862507,  2.884664,  2.088147,  1.648496,  1.473584,
+  1.340123,  1.291769,  1.165381,  1.000224,  0.893316,  0.821333,  0.691363,
+  0.610501,  0.586766,  0.583762,  0.577840,  0.468733,  3.104660,  3.181078,
+  2.420208,  1.747442,  1.297956,  1.109835,  0.970385,  0.943229,  0.876923,
+  0.777584,  0.678183,  0.628623,  0.553745,  0.523430,  0.519490,  0.514394,
+  0.492259,  0.403172,  2.593833,  2.533720,  2.010452,  1.480944,  1.060302,
+  0.846383,  0.738703,  0.673144,  0.658010,  0.592449,  0.518236,  0.470335,
+  0.425088,  0.393168,  0.378116,  0.355846,  0.275469,  0.213128,  2.176988,
+  2.089575,  1.671284,  1.225008,  0.895382,  0.672008,  0.566241,  0.496746,
+  0.488005,  0.449874,  0.400899,  0.354002,  0.318150,  0.281533,  0.238545,
+  0.224159,  0.202399,  0.160681,  1.874679,  1.769165,  1.430124,  1.068727,
+  0.780272,  0.557801,  0.441643,  0.377256,  0.352957,  0.338452,  0.304965,
+  0.273172,  0.240052,  0.208724,  0.193431,  0.190845,  0.185025,  0.138166,
+  1.590226,  1.502830,  1.193127,  0.917885,  0.670432,  0.474546,  0.355420,
+  0.292305,  0.259035,  0.249937,  0.232079,  0.208943,  0.181936,  0.160038,
+  0.152257,  0.151235,  0.149583,  0.120747,  1.331730,  1.255907,  1.012871,
+  0.778422,  0.578977,  0.412432,  0.293155,  0.231824,  0.197187,  0.183921,
+  0.174876,  0.157252,  0.140263,  0.127050,  0.110244,  0.105041,  0.104323,
+  0.086944,  1.153994,  1.118771,  0.822355,  0.612321,  0.478249,  0.348222,
+  0.247408,  0.186141,  0.152714,  0.135445,  0.129810,  0.119994,  0.115619,
+  0.131626,  0.095612,  0.079343,  0.077502,  0.064550,  0.946317,  0.925894,
+  0.677969,  0.499906,  0.397101,  0.297931,  0.214467,  0.152333,  0.120731,
+  0.102686,  0.095062,  0.090361,  0.122319,  0.240194,  0.112687,  0.070690,
+  0.070461,  0.054194,  0.824155,  0.787241,  0.581856,  0.419228,  0.313167,
+  0.245582,  0.183500,  0.128101,  0.096577,  0.080267,  0.071022,  0.066851,
+  0.085754,  0.154163,  0.075884,  0.052401,  0.054270,  0.026656,  0.716310,
+  0.671378,  0.489580,  0.349569,  0.256155,  0.206343,  0.157853,  0.111950,
+  0.079271,  0.062518,  0.053441,  0.049660,  0.051400,  0.063778,  0.039993,
+  0.029133,  0.023382,  0.013725,  0.614125,  0.579096,  0.417126,  0.299465,
+  0.217849,  0.165515,  0.129040,  0.093127,  0.065612,  0.049543,  0.041429,
+  0.036850,  0.034416,  0.033989,  0.024216,  0.017377,  0.014833,  0.011987,
+  0.520407,  0.487239,  0.349473,  0.251741,  0.184897,  0.135813,  0.107098,
+  0.073607,  0.053938,  0.040531,  0.032931,  0.028876,  0.025759,  0.022168,
+  0.016739,  0.014638,  0.014333,  0.011947,  0.449954,  0.415124,  0.299452,
+  0.216942,  0.158874,  0.115334,  0.088821,  0.060105,  0.042610,  0.032566,
+  0.026903,  0.023123,  0.019913,  0.016835,  0.014306,  0.013625,  0.013535,
+  0.011284,  0.377618,  0.347773,  0.251741,  0.184839,  0.132857,  0.095439,
+  0.070462,  0.052244,  0.036078,  0.026025,  0.021518,  0.018487,  0.015361,
+  0.012905,  0.011470,  0.010569,  0.010283,  0.008297,  0.319953,  0.297976,
+  0.216942,  0.158842,  0.113280,  0.080426,  0.057367,  0.041987,  0.030135,
+  0.022295,  0.017901,  0.015121,  0.012224,  0.010035,  0.009353,  0.009108,
+  0.008695,  0.006139,  0.267864,  0.250502,  0.184839,  0.132851,  0.095039,
+  0.068220,  0.049135,  0.035315,  0.025144,  0.018237,  0.013857,  0.012094,
+  0.009715,  0.007743,  0.006937,  0.006446,  0.006243,  0.004929,  0.230449,
+  0.215895,  0.158842,  0.113280,  0.080417,  0.057174,  0.041304,  0.029959,
+  0.021866,  0.015673,  0.012133,  0.010083,  0.007801,  0.006053,  0.005401,
+  0.003834,  0.003429,  0.002851,  0.193984,  0.183963,  0.132851,  0.095039,
+  0.068220,  0.049133,  0.035305,  0.025140,  0.018150,  0.013175,  0.010422,
+  0.008491,  0.006397,  0.004567,  0.003494,  0.002933,  0.002825,  0.002355,
+  0.167298,  0.158088,  0.113280,  0.080417,  0.057174,  0.041304,  0.029959,
+  0.021866,  0.015669,  0.011955,  0.009257,  0.007051,  0.005543,  0.003905,
+  0.002984,  0.002825,  0.002814,  0.002347,  0.143228,  0.132220,  0.095039,
+  0.068220,  0.049133,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
+  0.008403,  0.006661,  0.005378,  0.003545,  0.002876,  0.002818,  0.002814,
+  0.002347,  0.122934,  0.112735,  0.080417,  0.057174,  0.041304,  0.029959,
+  0.021866,  0.015669,  0.011955,  0.009258,  0.007182,  0.006012,  0.003762,
+  0.002866,  0.002739,  0.002788,  0.002810,  0.002347,  0.101934,  0.094569,
+  0.068220,  0.049133,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
+  0.008405,  0.006797,  0.005845,  0.003333,  0.002703,  0.002695,  0.002723,
+  0.002781,  0.002343,  0.086702,  0.080014,  0.057174,  0.041304,  0.029959,
+  0.021866,  0.015669,  0.011955,  0.009258,  0.007190,  0.006533,  0.005839,
+  0.003326,  0.002700,  0.002690,  0.002694,  0.002716,  0.002314,  0.073040,
+  0.067886,  0.049133,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
+  0.008405,  0.006807,  0.006468,  0.005831,  0.003325,  0.002700,  0.002690,
+  0.002690,  0.002687,  0.002253,  0.061685,  0.056890,  0.041304,  0.029959,
+  0.021866,  0.015669,  0.011955,  0.009258,  0.007190,  0.006542,  0.006360,
+  0.005416,  0.003221,  0.002698,  0.002690,  0.002690,  0.002683,  0.002238,
+  0.052465,  0.048894,  0.035305,  0.025140,  0.018150,  0.013174,  0.010394,
+  0.008405,  0.006807,  0.006472,  0.005943,  0.003748,  0.002805,  0.002692,
+  0.002690,  0.002690,  0.002683,  0.002238,  0.043838,  0.041101,  0.029959,
+  0.021866,  0.015669,  0.011955,  0.009258,  0.007190,  0.006543,  0.006465,
+  0.005839,  0.003333,  0.002702,  0.002690,  0.002690,  0.002690,  0.002683,
+  0.002238,  0.037824,  0.035133,  0.025140,  0.018150,  0.013174,  0.010394,
+  0.008405,  0.006807,  0.006480,  0.006464,  0.005838,  0.003326,  0.002700,
+  0.002690,  0.002690,  0.002690,  0.002683,  0.002238,  0.031865,  0.029815,
+  0.021866,  0.015668,  0.011955,  0.009258,  0.007190,  0.006543,  0.006475,
+  0.006462,  0.005831,  0.003325,  0.002700,  0.002690,  0.002690,  0.002690,
+  0.002683,  0.002238,  0.027150,  0.025016,  0.018128,  0.013083,  0.010371,
+  0.008405,  0.006807,  0.006480,  0.006472,  0.006359,  0.005416,  0.003221,
+  0.002698,  0.002690,  0.002690,  0.002690,  0.002683,  0.002238,  0.023094,
+  0.021760,  0.015577,  0.011590,  0.009167,  0.007188,  0.006543,  0.006475,
+  0.006466,  0.005943,  0.003748,  0.002805,  0.002692,  0.002690,  0.002690,
+  0.002690,  0.002683,  0.002238,  0.019269,  0.018038,  0.013060,  0.010280,
+  0.008382,  0.006806,  0.006480,  0.006474,  0.006464,  0.005839,  0.003333,
+  0.002702,  0.002690,  0.002690,  0.002690,  0.002690,  0.002683,  0.002238,
+  0.016874,  0.015472,  0.011566,  0.009148,  0.007171,  0.006527,  0.006458,
+  0.006457,  0.006447,  0.005823,  0.003318,  0.002693,  0.002683,  0.002683,
+  0.002683,  0.002683,  0.002676,  0.002232,  0.011968,  0.011056,  0.008762,
+  0.007219,  0.005717,  0.005391,  0.005386,  0.005386,  0.005377,  0.004856,
+  0.002767,  0.002246,  0.002238,  0.002238,  0.002238,  0.002238,  0.002232,
+  0.001862,
+};
+
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,  // Writes bicubic lookups of interp_rgrid_surf / interp_dgrid_surf at (xm, yl)
+                          double *dist_f) {  // to *rate_f and *dist_f respectively.
+  const double x_start = -0.5;  // Grid column c corresponds to xm = x_start + c * x_step.
+  const double x_end = 16.5;
+  const double x_step = 1;
+  const double y_start = -15.5;  // Grid row r corresponds to yl = y_start + r * y_step.
+  const double y_end = 16.5;
+  const double y_step = 0.5;
+  const double epsilon = 1e-6;
+  const int stride = (int)rint((x_end - x_start) / x_step) + 1;  // Samples per grid row: 18.
+  (void)y_end;
+
+  xm = AOMMAX(xm, x_start + x_step + epsilon);  // Keep the query one step inside each edge so the
+  xm = AOMMIN(xm, x_end - x_step - epsilon);  // 4x4 patch read below stays within the 65 x 18 grid.
+  yl = AOMMAX(yl, y_start + y_step + epsilon);
+  yl = AOMMIN(yl, y_end - y_step - epsilon);
+
+  const double y = (yl - y_start) / y_step;  // Query position in grid units (rows).
+  const double x = (xm - x_start) / x_step;  // Query position in grid units (columns).
+
+  const int yi = (int)floor(y);  // Integer cell index; the fractional part becomes the interpolation offset.
+  const int xi = (int)floor(x);
+  assert(xi > 0);  // Follows from the lower bounds applied to xm and yl above.
+  assert(yi > 0);
+
+  const double yo = y - yi;  // Offsets within the cell, each in [0, 1).
+  const double xo = x - xi;
+  const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];  // First element of the 4x4 patch: one row above and one column left of (xi, yi).
+  const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
+  *rate_f = interp_bicubic(prate, stride, xo, yo);
+  *dist_f = interp_bicubic(pdist, stride, xo, yo);
+}
+
+static const double interp_rgrid_curv[65] = {  // Samples interpolated into *rate_f by av1_model_rd_curvfit; entry i is at xqr = -15.5 + 0.5 * i.
+  0.000000,    0.000000,    0.000000,    0.000000,    0.000000,     0.000000,
+  0.000000,    0.000000,    0.000000,    0.000000,    0.000000,     0.000000,
+  0.000000,    0.000000,    0.000000,    0.000000,    0.000000,     4.759876,
+  8.132086,    13.651828,   21.908271,   33.522054,   48.782376,    71.530983,
+  106.728649,  151.942795,  199.893011,  242.850965,  283.933923,   322.154203,
+  360.684608,  394.801656,  426.879017,  460.234313,  484.103987,   508.261495,
+  536.486763,  558.196737,  586.285894,  614.764511,  634.166333,   647.706472,
+  658.211478,  681.360407,  701.052141,  727.007310,  768.663973,   804.407660,
+  884.627751,  1065.658131, 1238.875214, 1440.185176, 1678.377931,  1962.243390,
+  2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184,  5116.798028,
+  5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+};
+
+static const double interp_dgrid_curv[65] = {  // Samples interpolated into *distbysse_f by av1_model_rd_curvfit; entry i is at xqr = -15.5 + 0.5 * i.
+  14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
+  14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
+  14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
+  10.728960, 9.861975,  8.643612,  6.916021,  5.154769,  3.734940,  2.680051,
+  1.925506,  1.408410,  1.042223,  0.767641,  0.565392,  0.420116,  0.310427,
+  0.231711,  0.172999,  0.128293,  0.094992,  0.072171,  0.052972,  0.039354,
+  0.029555,  0.022857,  0.016832,  0.013297,  0.000000,  0.000000,  0.000000,
+  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,
+  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,
+  0.000000,  0.000000,
+};
+
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {  // Writes cubic lookups of interp_rgrid_curv / interp_dgrid_curv at xqr to *rate_f and *distbysse_f.
+  const double x_start = -15.5;  // Grid entry i corresponds to xqr = x_start + i * x_step.
+  const double x_end = 16.5;
+  const double x_step = 0.5;
+  const double epsilon = 1e-6;
+  (void)x_end;
+
+  xqr = AOMMAX(xqr, x_start + x_step + epsilon);  // Keep the query one step inside each end so the
+  xqr = AOMMIN(xqr, x_end - x_step - epsilon);  // four samples read below stay within the 65-entry grids.
+  const double x = (xqr - x_start) / x_step;  // Query position in grid units.
+  const int xi = (int)floor(x);  // Integer cell index.
+  const double xo = x - xi;  // Offset within the cell, in [0, 1).
+
+  assert(xi > 0);  // Follows from the lower bound applied to xqr above.
+
+  const double *prate = &interp_rgrid_curv[(xi - 1)];  // Start one sample before xi: interp_cubic treats p[1] as the sample at offset 0.
+  const double *pdist = &interp_dgrid_curv[(xi - 1)];
+  *rate_f = interp_cubic(prate, xo);
+  *distbysse_f = interp_cubic(pdist, xo);
+}
+
 static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
                                        const struct macroblockd_plane *pd,
                                        ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
@@ -814,8 +1281,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
     rd->thresh_mult[THR_NEARESTG] = 0;
   }
 
-  rd->thresh_mult[THR_DC] += 1000;
-
   rd->thresh_mult[THR_NEWMV] += 1000;
   rd->thresh_mult[THR_NEWL2] += 1000;
   rd->thresh_mult[THR_NEWL3] += 1000;
@@ -840,8 +1305,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_GLOBALG] += 2000;
   rd->thresh_mult[THR_GLOBALA] += 2000;
 
-  rd->thresh_mult[THR_PAETH] += 1000;
-
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
@@ -956,15 +1419,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
 
-  rd->thresh_mult[THR_H_PRED] += 2000;
-  rd->thresh_mult[THR_V_PRED] += 2000;
-  rd->thresh_mult[THR_D135_PRED] += 2500;
-  rd->thresh_mult[THR_D203_PRED] += 2500;
-  rd->thresh_mult[THR_D157_PRED] += 2500;
-  rd->thresh_mult[THR_D67_PRED] += 2500;
-  rd->thresh_mult[THR_D113_PRED] += 2500;
-  rd->thresh_mult[THR_D45_PRED] += 2500;
-
   rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
@@ -996,6 +1450,20 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
   rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
   rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
+
+  rd->thresh_mult[THR_DC] += 1000;
+  rd->thresh_mult[THR_PAETH] += 1000;
+  rd->thresh_mult[THR_SMOOTH] += 2000;
+  rd->thresh_mult[THR_SMOOTH_V] += 2000;
+  rd->thresh_mult[THR_SMOOTH_H] += 2000;
+  rd->thresh_mult[THR_H_PRED] += 2000;
+  rd->thresh_mult[THR_V_PRED] += 2000;
+  rd->thresh_mult[THR_D135_PRED] += 2500;
+  rd->thresh_mult[THR_D203_PRED] += 2500;
+  rd->thresh_mult[THR_D157_PRED] += 2500;
+  rd->thresh_mult[THR_D67_PRED] += 2500;
+  rd->thresh_mult[THR_D113_PRED] += 2500;
+  rd->thresh_mult[THR_D45_PRED] += 2500;
 }
 
 void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index 692367d7a..755b61df5 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RD_H_
-#define AV1_ENCODER_RD_H_
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
 
 #include <limits.h>
 
@@ -57,8 +57,6 @@ typedef enum {
   THR_NEARESTA,
   THR_NEARESTG,
 
-  THR_DC,
-
   THR_NEWMV,
   THR_NEWL2,
   THR_NEWL3,
@@ -100,12 +98,6 @@ typedef enum {
   THR_COMP_NEAREST_NEARESTLG,
   THR_COMP_NEAREST_NEARESTBA,
 
-  THR_PAETH,
-
-  THR_SMOOTH,
-  THR_SMOOTH_V,
-  THR_SMOOTH_H,
-
   THR_COMP_NEAR_NEARLA,
   THR_COMP_NEW_NEARESTLA,
   THR_COMP_NEAREST_NEWLA,
@@ -202,15 +194,6 @@ typedef enum {
   THR_COMP_NEW_NEWGA2,
   THR_COMP_GLOBAL_GLOBALGA2,
 
-  THR_H_PRED,
-  THR_V_PRED,
-  THR_D135_PRED,
-  THR_D203_PRED,
-  THR_D157_PRED,
-  THR_D67_PRED,
-  THR_D113_PRED,
-  THR_D45_PRED,
-
   THR_COMP_NEAR_NEARLL2,
   THR_COMP_NEW_NEARESTLL2,
   THR_COMP_NEAREST_NEWLL2,
@@ -243,7 +226,26 @@ typedef enum {
   THR_COMP_NEW_NEWBA,
   THR_COMP_GLOBAL_GLOBALBA,
 
-  MAX_MODES
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
+
+  MAX_MODES,
+
+  LAST_SINGLE_REF_MODES = THR_GLOBALG,
+  MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
+  LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
+  MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
 } THR_MODES;
 
 typedef enum {
@@ -392,6 +394,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                   unsigned int qstep, int *rate, int64_t *dist);
 
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+                          double *distbysse_f);
+
 int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
                             const MACROBLOCKD *xd);
 
@@ -455,4 +461,4 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RD_H_
+#endif  // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index fef6d2875..c2d15534f 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -55,17 +55,97 @@
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tokenize.h"
 #include "av1/encoder/tx_prune_model_weights.h"
 
-#define DNN_BASED_RD_INTERP_FILTER 0
+typedef void (*model_rd_for_sb_type)(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+                                       const MACROBLOCK *const x,
+                                       BLOCK_SIZE plane_bsize, int plane,
+                                       int64_t sse, int num_samples, int *rate,
+                                       int64_t *dist);
 
-// Set this macro as 1 to collect data about tx size selection.
-#define COLLECT_TX_SIZE_DATA 0
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+                            int plane_to, int mi_row, int mi_col,
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb,
+                            int *plane_rate, int64_t *plane_sse,
+                            int64_t *plane_dist);
+static void model_rd_for_sb_with_curvfit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_surffit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_dnn(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_fullrdy(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+                              int plane, int64_t sse, int num_samples,
+                              int *rate, int64_t *dist);
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+                              int plane, int64_t sse, int num_samples,
+                              int *rate, int64_t *dist);
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                  const MACROBLOCK *const x,
+                                  BLOCK_SIZE plane_bsize, int plane,
+                                  int64_t sse, int num_samples, int *rate,
+                                  int64_t *dist);
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+                                  const MACROBLOCK *const x,
+                                  BLOCK_SIZE plane_bsize, int plane,
+                                  int64_t sse, int num_samples, int *rate,
+                                  int64_t *dist);
 
-#if COLLECT_TX_SIZE_DATA
-static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
-#endif
+typedef enum {  // Order matches model_rd_sb_fn[] and model_rd_sse_fn[].
+  MODELRD_LEGACY,   // model_rd_for_sb, model_rd_from_sse
+  MODELRD_CURVFIT,  // model_rd_for_sb_with_curvfit, model_rd_with_curvfit
+  MODELRD_SUFFIT,   // model_rd_for_sb_with_surffit, model_rd_with_surffit
+  MODELRD_DNN,      // model_rd_for_sb_with_dnn, model_rd_with_dnn
+  MODELRD_FULLRDY,  // model_rd_for_sb_with_fullrdy; NULL in model_rd_sse_fn[]
+  MODELRD_TYPES     // array size of both tables
+} ModelRdType;
+
+static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+  model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
+  model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
+};
+
+static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+  model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
+  model_rd_with_dnn, NULL
+};
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_JNT_COMPOUND 1
 
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
@@ -103,6 +183,16 @@ typedef enum {
   FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
 } FAST_TX_SEARCH_MODE;
 
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                               RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+                               int mi_col, int64_t ref_best_rd);
+
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                            int64_t non_skip_ref_best_rd,
+                            int64_t skip_ref_best_rd,
+                            FAST_TX_SEARCH_MODE ftxs_mode);
+
 struct rdcost_block_args {
   const AV1_COMP *cpi;
   MACROBLOCK *x;
@@ -112,6 +202,7 @@ struct rdcost_block_args {
   int64_t this_rd;
   int64_t best_rd;
   int exit_early;
+  int incomplete_exit;
   int use_fast_coef_costing;
   FAST_TX_SEARCH_MODE ftxs_mode;
 };
@@ -126,8 +217,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
 
-  { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
-
   { NEWMV, { LAST_FRAME, NONE_FRAME } },
   { NEWMV, { LAST2_FRAME, NONE_FRAME } },
   { NEWMV, { LAST3_FRAME, NONE_FRAME } },
@@ -172,12 +261,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
 
-  { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
-
-  { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
-
   { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -274,15 +357,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
 
-  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
-
   { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
@@ -314,6 +388,21 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+  // intra modes
+  { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
 };
 
 static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
@@ -451,7 +540,6 @@ static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
   if (this_mode >= SINGLE_INTER_MODE_START &&
       this_mode < SINGLE_INTER_MODE_END) {
     assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
-    assert(second_ref_frame == NONE_FRAME);
     return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
                                    [ref_frame];
   }
@@ -479,6 +567,12 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
   UV_D113_PRED,   UV_D45_PRED,
 };
 
+typedef struct SingleInterModeState {
+  int64_t rd;
+  MV_REFERENCE_FRAME ref_frame;
+  int valid;
+} SingleInterModeState;
+
 typedef struct InterModeSearchState {
   int64_t best_rd;
   MB_MODE_INFO best_mbmode;
@@ -510,32 +604,21 @@ typedef struct InterModeSearchState {
   int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
   int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
   int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
-  int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES];
+  int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+  // The rd of simple translation in single inter modes
+  int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+
+  // Single search results by [directions][modes][reference frames]
+  SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+  int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+  SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+                                            [FWD_REFS];
+  int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+
+  MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
 } InterModeSearchState;
 
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-
-typedef struct InterModeRdModel {
-  int ready;
-  double a;
-  double b;
-  double dist_mean;
-  int skip_count;
-  int non_skip_count;
-  int fp_skip_count;
-  int bracket_idx;
-} InterModeRdModel;
-
-InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-
-#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
-static int inter_mode_data_idx[4];
-static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-
 int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
   if (bsize == BLOCK_8X8) return 1;
   if (bsize == BLOCK_16X16) return 2;
@@ -543,137 +626,152 @@ int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
   return -1;
 }
 
-void av1_inter_mode_data_init() {
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
   for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
-    const int block_idx = inter_mode_data_block_idx(i);
-    if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
-    InterModeRdModel *md = &inter_mode_rd_models[i];
+    InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
     md->ready = 0;
-    md->skip_count = 0;
-    md->non_skip_count = 0;
-    md->fp_skip_count = 0;
-    md->bracket_idx = 0;
+    md->num = 0;
+    md->dist_sum = 0;
+    md->ld_sum = 0;
+    md->sse_sum = 0;
+    md->sse_sse_sum = 0;
+    md->sse_ld_sum = 0;
   }
 }
 
-void av1_inter_mode_data_show(const AV1_COMMON *cm) {
-  printf("frame_offset %d\n", cm->frame_offset);
-  for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
-    const int block_idx = inter_mode_data_block_idx(i);
-    if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
-    InterModeRdModel *md = &inter_mode_rd_models[i];
-    if (md->ready) {
-      printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i,
-             md->non_skip_count, md->skip_count, md->fp_skip_count);
+static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                             int64_t sse, int *est_residue_cost,
+                             int64_t *est_dist) {
+  aom_clear_system_state();
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  if (md->ready) {
+    const double est_ld = md->a * sse + md->b;
+    if (sse < md->dist_mean) {
+      *est_residue_cost = 0;
+      *est_dist = sse;
+    } else {
+      *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld);
+      *est_dist = (int64_t)round(md->dist_mean);
     }
+    return 1;
   }
+  return 0;
 }
 
-static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse,
-                          int curr_cost) {
-  aom_clear_system_state();
-  InterModeRdModel *md = &inter_mode_rd_models[bsize];
-  if (md->ready) {
-    const double est_ld = md->a * sse + md->b;
-    const double est_residue_cost = (sse - md->dist_mean) / est_ld;
-    const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost;
-    const int64_t int64_dist_mean = (int64_t)round(md->dist_mean);
-    const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean);
+static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
+                          int64_t sse, int curr_cost) {
+  int est_residue_cost;
+  int64_t est_dist;
+  if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) {
+    int rate = est_residue_cost + curr_cost;
+    int64_t est_rd = RDCOST(rdmult, rate, est_dist);
     return est_rd;
   }
   return 0;
 }
 
-#define DATA_BRACKETS 7
-static const int data_num_threshold[DATA_BRACKETS] = {
-  200, 400, 800, 1600, 3200, 6400, INT32_MAX
-};
-
-void av1_inter_mode_data_fit(int rdmult) {
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
   aom_clear_system_state();
   for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
     const int block_idx = inter_mode_data_block_idx(bsize);
-    InterModeRdModel *md = &inter_mode_rd_models[bsize];
+    InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
     if (block_idx == -1) continue;
-    int data_num = inter_mode_data_idx[block_idx];
-    if (data_num < data_num_threshold[md->bracket_idx]) {
+    if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
       continue;
+    } else {
+      if (md->ready == 0) {
+        md->dist_mean = md->dist_sum / md->num;
+        md->ld_mean = md->ld_sum / md->num;
+        md->sse_mean = md->sse_sum / md->num;
+        md->sse_sse_mean = md->sse_sse_sum / md->num;
+        md->sse_ld_mean = md->sse_ld_sum / md->num;
+      } else {
+        const double factor = 3;
+        md->dist_mean =
+            (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+        md->ld_mean =
+            (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+        md->sse_mean =
+            (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+        md->sse_sse_mean =
+            (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+            (factor + 1);
+        md->sse_ld_mean =
+            (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+            (factor + 1);
+      }
+
+      const double my = md->ld_mean;
+      const double mx = md->sse_mean;
+      const double dx = sqrt(md->sse_sse_mean);
+      const double dxy = md->sse_ld_mean;
+
+      md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+      md->b = my - md->a * mx;
+      md->ready = 1;
+
+      md->num = 0;
+      md->dist_sum = 0;
+      md->ld_sum = 0;
+      md->sse_sum = 0;
+      md->sse_sse_sum = 0;
+      md->sse_ld_sum = 0;
     }
-    double my = 0;
-    double mx = 0;
-    double dx = 0;
-    double dxy = 0;
-    double dist_mean = 0;
-    const int train_num = data_num;
-    for (int i = 0; i < train_num; ++i) {
-      const double sse = (double)inter_mode_data_sse[block_idx][i];
-      const double dist = (double)inter_mode_data_dist[block_idx][i];
-      const double residue_cost = inter_mode_data_residue_cost[block_idx][i];
-      const double ld = (sse - dist) / residue_cost;
-      dist_mean += dist;
-      my += ld;
-      mx += sse;
-      dx += sse * sse;
-      dxy += sse * ld;
-    }
-    dist_mean = dist_mean / data_num;
-    my = my / train_num;
-    mx = mx / train_num;
-    dx = sqrt(dx / train_num);
-    dxy = dxy / train_num;
-
-    md->dist_mean = dist_mean;
-    md->a = (dxy - mx * my) / (dx * dx - mx * mx);
-    md->b = my - md->a * mx;
-    ++md->bracket_idx;
-    md->ready = 1;
-    assert(md->bracket_idx < DATA_BRACKETS);
-
     (void)rdmult;
-#if 0
-    int skip_count = 0;
-    int fp_skip_count = 0;
-    double avg_error = 0;
-    const int test_num = data_num;
-    for (int i = 0; i < data_num; ++i) {
-      const int64_t sse = inter_mode_data_sse[block_idx][i];
-      const int64_t dist = inter_mode_data_dist[block_idx][i];
-      const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i];
-      const int64_t all_cost = inter_mode_data_all_cost[block_idx][i];
-      const int64_t est_rd =
-          get_est_rd(bsize, rdmult, sse, all_cost - residue_cost);
-      const int64_t real_rd = RDCOST(rdmult, all_cost, dist);
-      const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i];
-      if (est_rd > ref_best_rd) {
-        ++skip_count;
-        if (real_rd < ref_best_rd) {
-          ++fp_skip_count;
-        }
-      }
-      avg_error += abs(est_rd - real_rd) * 100. / real_rd;
-    }
-    avg_error /= test_num;
-    printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n",
-           test_num, bsize, avg_error, skip_count, fp_skip_count);
-#endif
   }
 }
 
-static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist,
-                                 int residue_cost, int all_cost,
-                                 int64_t ref_best_rd) {
+static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                                 int64_t sse, int64_t dist, int residue_cost) {
   if (residue_cost == 0 || sse == dist) return;
   const int block_idx = inter_mode_data_block_idx(bsize);
   if (block_idx == -1) return;
-  if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) {
-    const int data_idx = inter_mode_data_idx[block_idx];
-    inter_mode_data_sse[block_idx][data_idx] = sse;
-    inter_mode_data_dist[block_idx][data_idx] = dist;
-    inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost;
-    inter_mode_data_all_cost[block_idx][data_idx] = all_cost;
-    inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd;
-    ++inter_mode_data_idx[block_idx];
+  InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
+  if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+    aom_clear_system_state();
+    const double ld = (sse - dist) * 1. / residue_cost;  // drop per cost unit
+    ++rd_model->num;  // one more sample folded into the running sums
+    rd_model->dist_sum += dist;
+    rd_model->ld_sum += ld;
+    rd_model->sse_sum += sse;
+    rd_model->sse_sse_sum += (double)sse * (double)sse;  // avoid int64 overflow
+    rd_model->sse_ld_sum += sse * ld;
+  }
+}
+
+static void inter_modes_info_push(InterModesInfo *inter_modes_info,
+                                  int mode_rate, int64_t sse, int64_t est_rd,
+                                  const MB_MODE_INFO *mbmi) {
+  const int num = inter_modes_info->num;  // slot the new entry is written to
+  assert(num < MAX_INTER_MODES);  // NOTE(review): debug-build check only
+  inter_modes_info->mbmi_arr[num] = *mbmi;  // struct is copied by value
+  inter_modes_info->mode_rate_arr[num] = mode_rate;
+  inter_modes_info->sse_arr[num] = sse;
+  inter_modes_info->est_rd_arr[num] = est_rd;
+  ++inter_modes_info->num;
+}
+
+static int compare_rd_idx_pair(const void *a, const void *b) {  // qsort cmp
+  if (((const RdIdxPair *)a)->rd == ((const RdIdxPair *)b)->rd) {
+    return 0;
+  } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) {
+    return 1;  // larger rd sorts later, i.e. ascending order
+  } else {
+    return -1;
+  }
+}
+
+static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
+                                  RdIdxPair *rd_idx_pair_arr) {
+  if (inter_modes_info->num == 0) {  // nothing to sort
+    return;
   }
+  for (int i = 0; i < inter_modes_info->num; ++i) {
+    rd_idx_pair_arr[i].idx = i;  // position in the inter_modes_info arrays
+    rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i];
+  }
+  qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
+        compare_rd_idx_pair);  // ascending rd; order of equal rd unspecified
 }
 #endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
 
@@ -1528,13 +1626,13 @@ static void score_2D_transform_pow8(float *scores_2D, float shift) {
 // will lead to pruning i+1 TX types on average
 static const float *prune_2D_adaptive_thresholds[] = {
   // TX_4X4
-  (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f,
-             0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f,
-             0.08606f, 0.09827f },
+  (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+             0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+             0.09778f, 0.11780f },
   // TX_8X8
-  (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f,
-             0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f,
-             0.09363f, 0.11682f },
+  (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+             0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+             0.10803f, 0.14124f },
   // TX_16X16
   (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
              0.06897f, 0.07629f, 0.08875f, 0.11169f },
@@ -1543,35 +1641,37 @@ static const float *prune_2D_adaptive_thresholds[] = {
   // TX_64X64
   NULL,
   // TX_4X8
-  (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f,
-             0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
-             0.09119f, 0.10828f },
+  (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+             0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+             0.10168f, 0.12585f },
   // TX_8X4
-  (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f,
-             0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
-             0.09167f, 0.10974f },
+  (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+             0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+             0.10583f, 0.13123f },
   // TX_8X16
-  (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f,
-             0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f,
-             0.09509f, 0.12097f },
+  (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+             0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+             0.10730f, 0.14221f },
   // TX_16X8
-  (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f,
-             0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f,
-             0.09485f, 0.12048f },
+  (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+             0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+             0.10339f, 0.13464f },
   // TX_16X32
-  (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f,
-             0.06506f, 0.07385f, 0.08606f, 0.10925f },
+  NULL,
   // TX_32X16
-  (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f,
-             0.06531f, 0.07336f, 0.08582f, 0.11072f },
+  NULL,
   // TX_32X64
   NULL,
   // TX_64X32
   NULL,
   // TX_4X16
-  NULL,
+  (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+             0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+             0.10242f, 0.12878f },
   // TX_16X4
-  NULL,
+  (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+             0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+             0.10217f, 0.12610f },
   // TX_8X32
   NULL,
   // TX_32X8
@@ -1631,7 +1731,18 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
                         cur_scores_2D[3];
   }
   score_2D_average /= 16;
-  score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+
+  const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
+  int pruning_aggressiveness = 1;
+  if (tx_set_type == EXT_TX_SET_ALL16) {
+    score_2D_transform_pow8(scores_2D, (10 - score_2D_average));
+    pruning_aggressiveness =
+        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+  } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
+    score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+    pruning_aggressiveness =
+        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+  }
 
   // Always keep the TX type with the highest score, prune all others with
   // score below score_thresh.
@@ -1645,18 +1756,6 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
     }
   }
 
-  int pruning_aggressiveness = 0;
-  if (prune_mode == PRUNE_2D_ACCURATE) {
-    if (tx_set_type == EXT_TX_SET_ALL16)
-      pruning_aggressiveness = 6;
-    else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
-      pruning_aggressiveness = 4;
-  } else if (prune_mode == PRUNE_2D_FAST) {
-    if (tx_set_type == EXT_TX_SET_ALL16)
-      pruning_aggressiveness = 10;
-    else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
-      pruning_aggressiveness = 7;
-  }
   const float score_thresh =
       prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
 
@@ -1724,9 +1823,11 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
 }
 
 static void model_rd_from_sse(const AV1_COMP *const cpi,
-                              const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
-                              int plane, int64_t sse, int *rate,
-                              int64_t *dist) {
+                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+                              int plane, int64_t sse, int num_samples,
+                              int *rate, int64_t *dist) {
+  (void)num_samples;
+  const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dequant_shift =
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
@@ -1734,15 +1835,17 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
   // Fast approximate the modelling function.
   if (cpi->sf.simple_model_rd_from_var) {
     const int64_t square_error = sse;
-    int quantizer = (pd->dequant_Q3[1] >> dequant_shift);
+    int quantizer = pd->dequant_Q3[1] >> dequant_shift;
     if (quantizer < 120)
-      *rate = (int)((square_error * (280 - quantizer)) >>
-                    (16 - AV1_PROB_COST_SHIFT));
+      *rate = (int)AOMMIN(
+          (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+          INT_MAX);
     else
       *rate = 0;
+    assert(*rate >= 0);
     *dist = (square_error * quantizer) >> 8;
   } else {
-    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
                                  pd->dequant_Q3[1] >> dequant_shift, rate,
                                  dist);
   }
@@ -1776,22 +1879,23 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
 
 static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
-                            int plane_to, int *out_rate_sum,
-                            int64_t *out_dist_sum, int *skip_txfm_sb,
-                            int64_t *skip_sse_sb, int *plane_rate,
-                            int64_t *plane_sse, int64_t *plane_dist) {
+                            int plane_to, int mi_row, int mi_col,
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb,
+                            int *plane_rate, int64_t *plane_sse,
+                            int64_t *plane_dist) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
   int plane;
+  (void)mi_row;
+  (void)mi_col;
   const int ref = xd->mi[0]->ref_frame[0];
 
   int64_t rate_sum = 0;
   int64_t dist_sum = 0;
   int64_t total_sse = 0;
 
-  x->pred_sse[ref] = 0;
-
   for (plane = plane_from; plane <= plane_to; ++plane) {
     struct macroblock_plane *const p = &x->plane[plane];
     struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -1805,26 +1909,31 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 
     if (x->skip_chroma_rd && plane) continue;
 
-    // TODO(geza): Write direct sse functions that do not compute
-    // variance as well.
-    sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
     sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
 
+    model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
     if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
 
     total_sse += sse;
-
-    model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist);
-
     rate_sum += rate;
     dist_sum += dist;
     if (plane_rate) plane_rate[plane] = rate;
     if (plane_sse) plane_sse[plane] = sse;
     if (plane_dist) plane_dist[plane] = dist;
+    assert(rate_sum >= 0);
   }
 
   if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
   if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  rate_sum = AOMMIN(rate_sum, INT_MAX);
   *out_rate_sum = (int)rate_sum;
   *out_dist_sum = dist_sum;
 }
@@ -1949,7 +2058,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
   assert(visible_cols > 0);
 
 #if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+  if (x->using_dist_8x8 && plane == 0)
     return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
                                   tx_bsize, txb_cols, txb_rows, visible_cols,
                                   visible_rows, x->qindex);
@@ -1967,8 +2076,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
 static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                       int blk_row, int blk_col,
                                       const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize,
-                                      int force_sse) {
+                                      const BLOCK_SIZE tx_bsize) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -1978,8 +2086,7 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
 #if CONFIG_DIST_8X8
   int txb_height = block_size_high[tx_bsize];
   int txb_width = block_size_wide[tx_bsize];
-  if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 &&
-      txb_height >= 8) {
+  if (x->using_dist_8x8 && plane == 0) {
     const int src_stride = x->plane[plane].src.stride;
     const int src_idx = (blk_row * src_stride + blk_col)
                         << tx_size_wide_log2[0];
@@ -2145,29 +2252,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
   av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
                               MAX_TX_SIZE, eob,
                               cpi->common.reduced_tx_set_used);
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
-    // Save decoded pixels for inter block in pd->pred to avoid
-    // block_8x8_rd_txfm_daala_dist() need to produce them
-    // by calling av1_inverse_transform_block() again.
-    const int pred_stride = block_size_wide[plane_bsize];
-    const int pred_idx = (blk_row * pred_stride + blk_col)
-                         << tx_size_wide_log2[0];
-    int16_t *pred = &x->pred_luma[pred_idx];
-    int i, j;
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          pred[j * pred_stride + i] =
-              CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
-    } else {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
-    }
-  }
-#endif  // CONFIG_DIST_8X8
   return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
                          blk_row, blk_col, plane_bsize, tx_bsize);
 }
@@ -2258,11 +2343,11 @@ static void get_2x2_normalized_sses_and_sads(
   }
 }
 
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
 #if CONFIG_COLLECT_RD_STATS
-  // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
-  // 0: Do not collect any RD stats
-  // 1: Collect RD stats for transform units
-  // 2: Collect RD stats for partition units
 
 #if CONFIG_COLLECT_RD_STATS == 1
 static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2274,7 +2359,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   // Generate small sample to restrict output size.
   static unsigned int seed = 21743;
-  if (lcg_rand16(&seed) % 100 > 0) return;
+  if (lcg_rand16(&seed) % 256 > 0) return;
 
   const char output_file[] = "tu_stats.txt";
   FILE *fout = fopen(output_file, "a");
@@ -2336,7 +2421,8 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   int model_rate;
   int64_t model_dist;
-  model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist);
+  model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+                                   &model_rate, &model_dist);
   const double model_rate_norm = (double)model_rate / num_samples;
   const double model_dist_norm = (double)model_dist / num_samples;
   fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
@@ -2360,7 +2446,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 }
 #endif  // CONFIG_COLLECT_RD_STATS == 1
 
-#if CONFIG_COLLECT_RD_STATS == 2
+#if CONFIG_COLLECT_RD_STATS >= 2
 static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
                                      const RD_STATS *const rd_stats,
                                      BLOCK_SIZE plane_bsize) {
@@ -2369,7 +2455,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   // Generate small sample to restrict output size.
   static unsigned int seed = 95014;
-  if (lcg_rand16(&seed) % 100 > 0) return;
+  if (lcg_rand16(&seed) % 256 > 0) return;
 
   const char output_file[] = "pu_stats.txt";
   FILE *fout = fopen(output_file, "a");
@@ -2390,8 +2476,10 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   const double rate_norm = (double)rd_stats->rate / num_samples;
   const double dist_norm = (double)rd_stats->dist / num_samples;
+  const double rdcost_norm =
+      (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
 
-  fprintf(fout, "%g %g", rate_norm, dist_norm);
+  fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
 
   const int src_stride = p->src.stride;
   const uint8_t *const src = p->src.buf;
@@ -2426,14 +2514,18 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
     fprintf(fout, " %g", sad_norm_arr[i]);
   }
 
-  fprintf(fout, " %d %d %d", q_step, bw, bh);
+  fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
 
   int model_rate;
   int64_t model_dist;
-  model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist);
+  model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+                                   &model_rate, &model_dist);
+  const double model_rdcost_norm =
+      (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
   const double model_rate_norm = (double)model_rate / num_samples;
   const double model_dist_norm = (double)model_dist / num_samples;
-  fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+  fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+          model_rdcost_norm);
 
   double mean = get_mean(src_diff, diff_stride, bw, bh);
   mean /= (1 << shift);
@@ -2450,53 +2542,51 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
   fprintf(fout, "\n");
   fclose(fout);
 }
-#endif  // CONFIG_COLLECT_RD_STATS == 2
+#endif  // CONFIG_COLLECT_RD_STATS >= 2
 #endif  // CONFIG_COLLECT_RD_STATS
 
-static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE plane_bsize, int plane, int64_t *rsse,
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+                              int plane, int64_t sse, int num_samples,
                               int *rate, int64_t *dist) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int log_numpels = num_pels_log2_lookup[plane_bsize];
 
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
   const struct macroblock_plane *const p = &x->plane[plane];
   int bw, bh;
-  const int diff_stride = block_size_wide[plane_bsize];
   get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
                      &bh);
-  const int num_samples = bw * bh;
-  const int dequant_shift =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
-  const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-
   const int src_stride = p->src.stride;
   const uint8_t *const src = p->src.buf;
   const int dst_stride = pd->dst.stride;
   const uint8_t *const dst = pd->dst.buf;
   const int16_t *const src_diff = p->src_diff;
+  const int diff_stride = block_size_wide[plane_bsize];
   const int shift = (xd->bd - 8);
-  int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh);
-  sse = ROUND_POWER_OF_TWO(sse, shift * 2);
-  const double sse_norm = (double)sse / num_samples;
 
   if (sse == 0) {
     if (rate) *rate = 0;
     if (dist) *dist = 0;
-    if (rsse) *rsse = sse;
     return;
   }
   if (plane) {
     int model_rate;
     int64_t model_dist;
-    model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate,
-                      &model_dist);
+    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples,
+                          &model_rate, &model_dist);
     if (rate) *rate = model_rate;
     if (dist) *dist = model_dist;
-    if (rsse) *rsse = sse;
     return;
   }
 
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+
   double sse_norm_arr[4];
   get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
                                    dst_stride, src_diff, diff_stride,
@@ -2506,25 +2596,26 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
     for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
     mean /= (1 << shift);
   }
-  const double variance = sse_norm - mean * mean;
-  assert(variance >= 0.0);
+  double sse_norm_sum = 0.0, sse_frac_arr[3];
+  for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k];
+  for (int k = 0; k < 3; ++k)
+    sse_frac_arr[k] =
+        sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25;
   const double q_sqr = (double)(q_step * q_step);
   const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
+  const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0);
   double hor_corr, vert_corr;
   get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
 
-  float features[11];
+  float features[NUM_FEATURES_PUSTATS];
   features[0] = (float)hor_corr;
   features[1] = (float)log_numpels;
-  features[2] = (float)q_sqr;
+  features[2] = (float)mean_sqr_by_sse_norm;
   features[3] = (float)q_sqr_by_sse_norm;
-  features[4] = (float)sse_norm_arr[0];
-  features[5] = (float)sse_norm_arr[1];
-  features[6] = (float)sse_norm_arr[2];
-  features[7] = (float)sse_norm_arr[3];
-  features[8] = (float)sse_norm;
-  features[9] = (float)variance;
-  features[10] = (float)vert_corr;
+  features[4] = (float)sse_frac_arr[0];
+  features[5] = (float)sse_frac_arr[1];
+  features[6] = (float)sse_frac_arr[2];
+  features[7] = (float)vert_corr;
 
   float rate_f, dist_by_sse_norm_f;
   av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
@@ -2532,27 +2623,29 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
   const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
   int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
 
   // Check if skip is better
-  if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) {
+  if (rate_i == 0) {
     dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
     rate_i = 0;
-  } else if (rate_i == 0) {
     dist_i = sse << 4;
   }
 
   if (rate) *rate = rate_i;
   if (dist) *dist = dist_i;
-  if (rsse) *rsse = sse;
   return;
 }
 
-void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                              MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
-                              int plane_to, int *out_rate_sum,
-                              int64_t *out_dist_sum, int *skip_txfm_sb,
-                              int64_t *skip_sse_sb, int *plane_rate,
-                              int64_t *plane_sse, int64_t *plane_dist) {
+static void model_rd_for_sb_with_dnn(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  (void)mi_row;
+  (void)mi_col;
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -2562,19 +2655,30 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   int64_t dist_sum = 0;
   int64_t total_sse = 0;
 
-  x->pred_sse[ref] = 0;
-
   for (int plane = plane_from; plane <= plane_to; ++plane) {
     struct macroblockd_plane *const pd = &xd->plane[plane];
     const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-    int64_t sse;
+    int64_t dist, sse;
     int rate;
-    int64_t dist;
 
     if (x->skip_chroma_rd && plane) continue;
 
-    model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist);
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const int shift = (xd->bd - 8);
+    int bw, bh;
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &bw, &bh);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
+    sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+    model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
 
     if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
 
@@ -2593,110 +2697,385 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   *out_dist_sum = dist_sum;
 }
 
-static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                               int block, int blk_row, int blk_col,
-                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                               const TXB_CTX *const txb_ctx,
-                               FAST_TX_SEARCH_MODE ftxs_mode,
-                               int use_fast_coef_costing, int64_t ref_best_rd,
-                               RD_STATS *best_rd_stats) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  int64_t best_rd = INT64_MAX;
-  uint16_t best_eob = 0;
-  TX_TYPE best_tx_type = DCT_DCT;
-  TX_TYPE last_tx_type = TX_TYPES;
-  const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
-  // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
-  // of the best tx_type
-  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
-  tran_low_t *orig_dqcoeff = pd->dqcoeff;
-  tran_low_t *best_dqcoeff = this_dqcoeff;
-  const int txk_type_idx =
-      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
-  av1_invalid_rd_stats(best_rd_stats);
+// Fits a surface for rate and distortion using as features:
+// log2(sse_norm + 1) and log2(sse_norm/qstep^2)
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+                                  const MACROBLOCK *const x,
+                                  BLOCK_SIZE plane_bsize, int plane,
+                                  int64_t sse, int num_samples, int *rate,
+                                  int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double xm = log(sse_norm + 1.0) / log(2.0);
+  const double yl = log(sse_norm / qstepsqr) / log(2.0);
+  double rate_f, dist_by_sse_norm_f;
 
-  TXB_RD_INFO *intra_txb_rd_info = NULL;
-  uint16_t cur_joint_ctx = 0;
-  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
-  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-  const int within_border =
-      mi_row >= xd->tile.mi_row_start &&
-      (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
-      mi_col >= xd->tile.mi_col_start &&
-      (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
-  if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
-      !is_inter && plane == 0 &&
-      tx_size_wide[tx_size] == tx_size_high[tx_size]) {
-    const uint32_t intra_hash =
-        get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
-    const int intra_hash_idx =
-        find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
-    intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+  av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
 
-    cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-    if (intra_hash_idx > 0 &&
-        intra_txb_rd_info->entropy_context == cur_joint_ctx &&
-        x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
-      mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
-      const TX_TYPE ref_tx_type =
-          av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
-                          tx_size, cpi->common.reduced_tx_set_used);
-      if (ref_tx_type == intra_txb_rd_info->tx_type) {
-        best_rd_stats->rate = intra_txb_rd_info->rate;
-        best_rd_stats->dist = intra_txb_rd_info->dist;
-        best_rd_stats->sse = intra_txb_rd_info->sse;
-        best_rd_stats->skip = intra_txb_rd_info->eob == 0;
-        x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
-        x->plane[plane].txb_entropy_ctx[block] =
-            intra_txb_rd_info->txb_entropy_ctx;
-        best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
-        best_eob = intra_txb_rd_info->eob;
-        best_tx_type = intra_txb_rd_info->tx_type;
-        update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
-                         best_tx_type);
-        goto RECON_INTRA;
-      }
-    }
-  }
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
 
-  int rate_cost = 0;
-  TX_TYPE txk_start = DCT_DCT;
-  TX_TYPE txk_end = TX_TYPES - 1;
-  if ((!is_inter && x->use_default_intra_tx_type) ||
-      (is_inter && x->use_default_inter_tx_type)) {
-    txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
-  } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
-    if (plane == 0) txk_end = DCT_DCT;
+  // Check if skip is better
+  if (rate_i == 0) {
+    dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
+    rate_i = 0;
+    dist_i = sse << 4;
   }
 
-  uint8_t best_txb_ctx = 0;
-  const TxSetType tx_set_type =
-      av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
 
-  TX_TYPE uv_tx_type = DCT_DCT;
-  if (plane) {
-    // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
-    uv_tx_type = txk_start = txk_end =
-        av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
-                        cm->reduced_tx_set_used);
-  }
-  const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
-      ext_tx_used_flag == 0x0001) {
-    txk_start = txk_end = DCT_DCT;
-  }
-  uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
-  if (txk_start == txk_end) {
-    allowed_tx_mask = 1 << txk_start;
-    allowed_tx_mask &= ext_tx_used_flag;
-  } else if (fast_tx_search) {
-    allowed_tx_mask = 0x0c01;  // V_DCT, H_DCT, DCT_DCT
-    allowed_tx_mask &= ext_tx_used_flag;
-  } else {
+static void model_rd_for_sb_with_surffit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  (void)mi_row;
+  (void)mi_col;
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    int64_t dist, sse;
+    int rate;
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    int bw, bh;
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const int shift = (xd->bd - 8);
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &bw, &bh);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
+    sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+    model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                          &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
+// Fits a curve for rate and distortion using as feature:
+// log2(sse_norm/qstep^2)
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                  const MACROBLOCK *const x,
+                                  BLOCK_SIZE plane_bsize, int plane,
+                                  int64_t sse, int num_samples, int *rate,
+                                  int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+
+  double rate_f, dist_by_sse_norm_f;
+  av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  if (rate_i == 0) {
+    dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
+    rate_i = 0;
+    dist_i = sse << 4;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+static void model_rd_for_sb_with_curvfit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  (void)mi_row;
+  (void)mi_col;
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    int64_t dist, sse;
+    int rate;
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    int bw, bh;
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const int shift = (xd->bd - 8);
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &bw, &bh);
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
+
+    sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                          &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
+static void model_rd_for_sb_with_fullrdy(  // Accumulates rate/dist/SSE over planes [plane_from, plane_to].
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,  // out_rate_sum/out_dist_sum are always written
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,  // skip_txfm_sb/skip_sse_sb are optional (NULL-checked)
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {  // optional per-plane outputs (NULL-checked), indexed by plane
+  const int ref = xd->mi[0]->ref_frame[0];  // indexes x->pred_sse when plane 0 is processed
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+    int64_t sse;
+    int rate;
+    int64_t dist;
+
+    if (x->skip_chroma_rd && plane) continue;  // planes != 0 add nothing and leave their plane_* slots unwritten
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // pick the SSE routine matching the buffer's bit-depth flag
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);  // shift amount is 2 * (bd - 8), i.e. 0 when xd->bd == 8
+
+    RD_STATS rd_stats;
+    if (plane == 0) {  // plane 0: take rate/dist from select_tx_type_yrd() rather than the curve-fit model
+      select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+      if (rd_stats.invalid_rate) {  // no usable result: fall back to zero rate and dist = 16 * sse
+        rate = 0;
+        dist = sse << 4;  // same x16 scaling as *skip_sse_sb below
+      } else {
+        rate = rd_stats.rate;
+        dist = rd_stats.dist;
+      }
+    } else {  // other planes: rate/dist come from model_rd_with_curvfit() given sse and bw * bh
+      model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                            &dist);
+    }
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);  // plane-0 SSE, capped at UINT_MAX
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;  // 1 when the accumulated SSE is zero, else 0
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;  // accumulated SSE scaled by 16
+  *out_rate_sum = (int)rate_sum;  // 64-bit accumulator narrowed to int
+  *out_dist_sum = dist_sum;
+}
+
+static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                               int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               const TXB_CTX *const txb_ctx,
+                               FAST_TX_SEARCH_MODE ftxs_mode,
+                               int use_fast_coef_costing, int64_t ref_best_rd,
+                               RD_STATS *best_rd_stats) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  int64_t best_rd = INT64_MAX;
+  uint16_t best_eob = 0;
+  TX_TYPE best_tx_type = DCT_DCT;
+  TX_TYPE last_tx_type = TX_TYPES;
+  const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+  // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
+  // of the best tx_type
+  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
+  tran_low_t *orig_dqcoeff = pd->dqcoeff;
+  tran_low_t *best_dqcoeff = this_dqcoeff;
+  const int txk_type_idx =
+      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  av1_invalid_rd_stats(best_rd_stats);
+
+  TXB_RD_INFO *intra_txb_rd_info = NULL;
+  uint16_t cur_joint_ctx = 0;
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+  if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+      !is_inter && plane == 0 &&
+      tx_size_wide[tx_size] == tx_size_high[tx_size]) {
+    const uint32_t intra_hash =
+        get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
+    const int intra_hash_idx =
+        find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
+    intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+
+    cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+    if (intra_txb_rd_info->entropy_context == cur_joint_ctx &&
+        x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+      mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
+      const TX_TYPE ref_tx_type =
+          av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+                          tx_size, cpi->common.reduced_tx_set_used);
+      if (ref_tx_type == intra_txb_rd_info->tx_type) {
+        best_rd_stats->rate = intra_txb_rd_info->rate;
+        best_rd_stats->dist = intra_txb_rd_info->dist;
+        best_rd_stats->sse = intra_txb_rd_info->sse;
+        best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+        x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+        x->plane[plane].txb_entropy_ctx[block] =
+            intra_txb_rd_info->txb_entropy_ctx;
+        best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
+        best_eob = intra_txb_rd_info->eob;
+        best_tx_type = intra_txb_rd_info->tx_type;
+        update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                         best_tx_type);
+        goto RECON_INTRA;
+      }
+    }
+  }
+
+  int rate_cost = 0;
+  TX_TYPE txk_start = DCT_DCT;
+  TX_TYPE txk_end = TX_TYPES - 1;
+  if ((!is_inter && x->use_default_intra_tx_type) ||
+      (is_inter && x->use_default_inter_tx_type)) {
+    txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+  } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
+    if (plane == 0) txk_end = DCT_DCT;
+  }
+
+  uint8_t best_txb_ctx = 0;
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+
+  TX_TYPE uv_tx_type = DCT_DCT;
+  if (plane) {
+    // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+    uv_tx_type = txk_start = txk_end =
+        av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
+                        cm->reduced_tx_set_used);
+  }
+  const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+      ext_tx_used_flag == 0x0001) {
+    txk_start = txk_end = DCT_DCT;
+  }
+  uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
+  if (txk_start == txk_end) {
+    allowed_tx_mask = 1 << txk_start;
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else if (fast_tx_search) {
+    allowed_tx_mask = 0x0c01;  // V_DCT, H_DCT, DCT_DCT
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else {
     assert(plane == 0);
     allowed_tx_mask = ext_tx_used_flag;
     // !fast_tx_search && txk_end != txk_start && plane == 0
@@ -2727,7 +3106,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 #if CONFIG_DIST_8X8
   if (x->using_dist_8x8) use_transform_domain_distortion = 0;
 #endif
-
   int calc_pixel_domain_distortion_final =
       cpi->sf.use_transform_domain_distortion == 1 &&
       use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
@@ -2740,7 +3118,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
   int64_t block_sse =
-      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1);
+      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
   block_sse *= 16;
@@ -2834,7 +3212,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   assert(best_rd != INT64_MAX);
 
   best_rd_stats->skip = best_eob == 0;
-  if (best_eob == 0) best_tx_type = DCT_DCT;
   if (plane == 0) {
     update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
                      best_tx_type);
@@ -2914,24 +3291,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   int64_t rd1, rd2, rd;
   RD_STATS this_rd_stats;
 
-#if CONFIG_DIST_8X8
-  // If sub8x8 tx, 8x8 or larger partition, and luma channel,
-  // dist-8x8 disables early skip, because the distortion metrics for
-  // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition
-  // (new distortion metric) are different.
-  // Exception is: dist-8x8 is enabled but still MSE is used,
-  // i.e. "--tune=" encoder option is not used.
-  int bw = block_size_wide[plane_bsize];
-  int bh = block_size_high[plane_bsize];
-  int disable_early_skip =
-      x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 &&
-      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
-      x->tune_metric != AOM_TUNE_PSNR;
-#endif  // CONFIG_DIST_8X8
-
   av1_init_rd_stats(&this_rd_stats);
 
-  if (args->exit_early) return;
+  if (args->exit_early) {
+    args->incomplete_exit = 1;
+    return;
+  }
 
   if (!is_inter_block(mbmi)) {
     av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
@@ -2954,11 +3319,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 #endif  // CONFIG_RD_DEBUG
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
-  if (plane == 0) {
-    x->blk_skip[blk_row *
-                    (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
-                blk_col] = (x->plane[plane].eobs[block] == 0);
-  }
+  const int blk_idx =
+      blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
+      blk_col;
+
+  if (plane == 0)
+    set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+  else
+    set_blk_skip(x, plane, blk_idx, 0);
 
   rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
   rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
@@ -2972,100 +3340,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 
   args->this_rd += rd;
 
-#if CONFIG_DIST_8X8
-  if (!disable_early_skip)
-#endif
-    if (args->this_rd > args->best_rd) {
-      args->exit_early = 1;
-      return;
-    }
-}
-
-#if CONFIG_DIST_8X8
-static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize,
-                                    struct rdcost_block_args *args) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[0];
-  const struct macroblock_plane *const p = &x->plane[0];
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int src_stride = p->src.stride;
-  const int dst_stride = pd->dst.stride;
-  const uint8_t *src = &p->src.buf[0];
-  const uint8_t *dst = &pd->dst.buf[0];
-  const int16_t *pred = &x->pred_luma[0];
-  int bw = block_size_wide[bsize];
-  int bh = block_size_high[bsize];
-  int visible_w = bw;
-  int visible_h = bh;
-
-  int i, j;
-  int64_t rd, rd1, rd2;
-  int64_t sse = INT64_MAX, dist = INT64_MAX;
-  int qindex = x->qindex;
-
-  assert((bw & 0x07) == 0);
-  assert((bh & 0x07) == 0);
-
-  get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
-                     &visible_h);
-
-  const int diff_stride = block_size_wide[bsize];
-  const int16_t *diff = p->src_diff;
-  sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w,
-                      visible_h, qindex);
-  sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-  sse *= 16;
-
-  if (!is_inter_block(mbmi)) {
-    dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh,
-                        visible_w, visible_h, qindex);
-    dist *= 16;
-  } else {
-    // For inter mode, the decoded pixels are provided in x->pred_luma,
-    // while the predicted pixels are in dst.
-    uint8_t *pred8;
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      pred8 = CONVERT_TO_BYTEPTR(pred16);
-    else
-      pred8 = (uint8_t *)pred16;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < bh; j++)
-        for (i = 0; i < bw; i++)
-          CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
-    } else {
-      for (j = 0; j < bh; j++)
-        for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
-    }
-
-    dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh,
-                        visible_w, visible_h, qindex);
-    dist *= 16;
-  }
-
-#ifdef DEBUG_DIST_8X8
-  if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) {
-    assert(args->rd_stats.sse == sse);
-    assert(args->rd_stats.dist == dist);
+  if (args->this_rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
   }
-#endif  // DEBUG_DIST_8X8
-
-  args->rd_stats.sse = sse;
-  args->rd_stats.dist = dist;
-
-  rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist);
-  rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse);
-  rd = AOMMIN(rd1, rd2);
-
-  args->rd_stats.rdcost = rd;
-  args->this_rd = rd;
-
-  if (args->this_rd > args->best_rd) args->exit_early = 1;
 }
-#endif  // CONFIG_DIST_8X8
 
 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                              RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
@@ -3089,16 +3368,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
 
   av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
                                          &args);
-#if CONFIG_DIST_8X8
-  int bw = block_size_wide[bsize];
-  int bh = block_size_high[bsize];
 
-  if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 &&
-      bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
-    dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
-#endif
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
 
-  if (args.exit_early) {
+  if (invalid_rd) {
     av1_invalid_rd_stats(rd_stats);
   } else {
     *rd_stats = args.rd_stats;
@@ -3269,6 +3544,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
   prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
 
   for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+#if CONFIG_DIST_8X8
+    if (x->using_dist_8x8) {
+      if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
+    }
+#endif
     RD_STATS this_rd_stats;
     if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
     rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
@@ -3284,10 +3564,13 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
     }
     if (n == TX_4X4) break;
   }
-  mbmi->tx_size = best_tx_size;
-  memcpy(mbmi->txk_type, best_txk_type,
-         sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
-  memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+
+  if (rd_stats->rate != INT_MAX) {
+    mbmi->tx_size = best_tx_size;
+    memcpy(mbmi->txk_type, best_txk_type,
+           sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+    memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+  }
 
   // Reset the pruning flags.
   av1_zero(x->tx_search_prune);
@@ -3429,7 +3712,8 @@ static int conditional_skipintra(PREDICTION_MODE mode,
 
 // Model based RD estimation for luma intra blocks.
 static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               BLOCK_SIZE bsize, int mode_cost) {
+                               BLOCK_SIZE bsize, int mode_cost, int mi_row,
+                               int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3450,10 +3734,9 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
     }
   }
   // RD estimation.
-  av1_subtract_plane(x, bsize, 0);
-  model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
-                  &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL,
-                  NULL, NULL);
+  model_rd_sb_fn[MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate,
+      &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
   if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
     mode_cost +=
         x->angle_delta_cost[mbmi->mode - V_PRED]
@@ -3519,13 +3802,16 @@ static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
 
 // Given the base colors as specified in centroids[], calculate the RD cost
 // of palette mode.
-static void palette_rd_y(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion,
-    int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) {
+static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
+                         MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int dc_mode_cost, const int *data,
+                         int *centroids, int n, uint16_t *color_cache,
+                         int n_cache, MB_MODE_INFO *best_mbmi,
+                         uint8_t *best_palette_color_map, int64_t *best_rd,
+                         int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+                         int *rate_overhead, int64_t *distortion,
+                         int *skippable, PICK_MODE_CONTEXT *ctx,
+                         uint8_t *blk_skip) {
   optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
   int k = av1_remove_duplicates(centroids, n);
   if (k < PALETTE_MIN_SIZE) {
@@ -3551,7 +3837,8 @@ static void palette_rd_y(
   extend_palette_color_map(color_map, cols, rows, block_width, block_height);
   const int palette_mode_cost =
       intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
-  int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
+  int64_t this_model_rd =
+      intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col);
   if (*best_model_rd != INT64_MAX &&
       this_model_rd > *best_model_rd + (*best_model_rd >> 1))
     return;
@@ -3580,11 +3867,11 @@ static void palette_rd_y(
 }
 
 static int rd_pick_palette_intra_sby(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
-    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
-    int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip) {
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) {
   int rate_overhead = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3668,10 +3955,11 @@ static int rd_pick_palette_intra_sby(
     // where the dominant colors and the k-means results are similar.
     for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
       for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
-      palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                   color_cache, n_cache, best_mbmi, best_palette_color_map,
-                   best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
-                   distortion, skippable, ctx, best_blk_skip);
+      palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+                   centroids, n, color_cache, n_cache, best_mbmi,
+                   best_palette_color_map, best_rd, best_model_rd, rate,
+                   rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+                   best_blk_skip);
     }
 
     // K-means clustering.
@@ -3688,10 +3976,11 @@ static int rd_pick_palette_intra_sby(
         }
         av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
       }
-      palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                   color_cache, n_cache, best_mbmi, best_palette_color_map,
-                   best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
-                   distortion, skippable, ctx, best_blk_skip);
+      palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+                   centroids, n, color_cache, n_cache, best_mbmi,
+                   best_palette_color_map, best_rd, best_model_rd, rate,
+                   rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+                   best_blk_skip);
     }
   }
 
@@ -3705,10 +3994,11 @@ static int rd_pick_palette_intra_sby(
 
 // Return 1 if an filter intra mode is selected; return 0 otherwise.
 static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    int *rate, int *rate_tokenonly,
-                                    int64_t *distortion, int *skippable,
-                                    BLOCK_SIZE bsize, int mode_cost,
-                                    int64_t *best_rd, int64_t *best_model_rd,
+                                    int mi_row, int mi_col, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable, BLOCK_SIZE bsize,
+                                    int mode_cost, int64_t *best_rd,
+                                    int64_t *best_model_rd,
                                     PICK_MODE_CONTEXT *ctx) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
@@ -3727,7 +4017,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
     int64_t this_rd, this_model_rd;
     RD_STATS tokenonly_rd_stats;
     mbmi->filter_intra_mode_info.filter_intra_mode = mode;
-    this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+    this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
     if (*best_model_rd != INT64_MAX &&
         this_model_rd > *best_model_rd + (*best_model_rd >> 1))
       continue;
@@ -3770,20 +4060,18 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
 // Run RD calculation with given luma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t calc_rd_given_intra_angle(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
-    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
-    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
-    int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type,
-    uint8_t *best_blk_skip) {
-  int this_rate;
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta,
+    int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta,
+    TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd,
+    TX_TYPE *best_txk_type, uint8_t *best_blk_skip) {
   RD_STATS tokenonly_rd_stats;
   int64_t this_rd, this_model_rd;
   MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
   const int n4 = bsize_to_num_blk(bsize);
   assert(!is_inter_block(mbmi));
-
   mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
-  this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+  this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
   if (*best_model_rd != INT64_MAX &&
       this_model_rd > *best_model_rd + (*best_model_rd >> 1))
     return INT64_MAX;
@@ -3791,10 +4079,9 @@ static int64_t calc_rd_given_intra_angle(
   super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
   if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
 
-  this_rate =
-      tokenonly_rd_stats.rate + mode_cost +
-      x->angle_delta_cost[mbmi->mode - V_PRED]
-                         [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]];
+  int this_rate =
+      mode_cost + tokenonly_rd_stats.rate +
+      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
   if (this_rd < *best_rd) {
@@ -3815,32 +4102,32 @@ static int64_t calc_rd_given_intra_angle(
 // With given luma directional intra prediction mode, pick the best angle delta
 // Return the RD cost corresponding to the best angle delta.
 static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int *rate, RD_STATS *rd_stats,
-                                       BLOCK_SIZE bsize, int mode_cost,
-                                       int64_t best_rd,
+                                       int mi_row, int mi_col, int *rate,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                       int mode_cost, int64_t best_rd,
                                        int64_t *best_model_rd) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
   assert(!is_inter_block(mbmi));
-  int i, angle_delta, best_angle_delta = 0;
-  int first_try = 1;
-  int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+  int best_angle_delta = 0;
+  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
   TX_SIZE best_tx_size = mbmi->tx_size;
-  const int n4 = bsize_to_num_blk(bsize);
   TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
-  for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
 
-  for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (i = 0; i < 2; ++i) {
-      best_rd_in = (best_rd == INT64_MAX)
-                       ? INT64_MAX
-                       : (best_rd + (best_rd >> (first_try ? 3 : 5)));
-      this_rd = calc_rd_given_intra_angle(
-          cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
-          MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-          &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+  int first_try = 1;
+  for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (int i = 0; i < 2; ++i) {
+      const int64_t best_rd_in =
+          (best_rd == INT64_MAX) ? INT64_MAX
+                                 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+      const int64_t this_rd = calc_rd_given_intra_angle(
+          cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in,
+          (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+          &best_angle_delta, &best_tx_size, &best_rd, best_model_rd,
+          best_txk_type, best_blk_skip);
       rd_cost[2 * angle_delta + i] = this_rd;
       if (first_try && this_rd == INT64_MAX) return best_rd;
       first_try = 0;
@@ -3852,28 +4139,31 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 
   assert(best_rd != INT64_MAX);
-  for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    int64_t rd_thresh;
-    for (i = 0; i < 2; ++i) {
+  for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (int i = 0; i < 2; ++i) {
       int skip_search = 0;
-      rd_thresh = best_rd + (best_rd >> 5);
+      const int64_t rd_thresh = best_rd + (best_rd >> 5);
       if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
           rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
         skip_search = 1;
       if (!skip_search) {
-        calc_rd_given_intra_angle(
-            cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
-            MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-            &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+        calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost,
+                                  best_rd, (1 - 2 * i) * angle_delta,
+                                  MAX_ANGLE_DELTA, rate, rd_stats,
+                                  &best_angle_delta, &best_tx_size, &best_rd,
+                                  best_model_rd, best_txk_type, best_blk_skip);
       }
     }
   }
 
-  mbmi->tx_size = best_tx_size;
-  mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
-  memcpy(mbmi->txk_type, best_txk_type,
-         sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
-  memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+  if (rd_stats->rate != INT_MAX) {
+    mbmi->tx_size = best_tx_size;
+    mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+    memcpy(mbmi->txk_type, best_txk_type,
+           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+    memcpy(x->blk_skip, best_blk_skip,
+           sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+  }
   return best_rd;
 }
 
@@ -4052,10 +4342,10 @@ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
 
 // This function is used only for intra_only frames
 static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                      int *rate, int *rate_tokenonly,
-                                      int64_t *distortion, int *skippable,
-                                      BLOCK_SIZE bsize, int64_t best_rd,
-                                      PICK_MODE_CONTEXT *ctx) {
+                                      int mi_row, int mi_col, int *rate,
+                                      int *rate_tokenonly, int64_t *distortion,
+                                      int *skippable, BLOCK_SIZE bsize,
+                                      int64_t best_rd, PICK_MODE_CONTEXT *ctx) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
@@ -4098,13 +4388,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   MB_MODE_INFO best_mbmi = *mbmi;
   /* Y Search for intra prediction mode */
-  for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) {
+  for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
     RD_STATS this_rd_stats;
     int this_rate, this_rate_tokenonly, s;
     int64_t this_distortion, this_rd, this_model_rd;
     mbmi->mode = intra_rd_search_mode_order[mode_idx];
     mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-    this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
+    this_model_rd =
+        intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
     if (best_model_rd != INT64_MAX &&
         this_model_rd > best_model_rd + (best_model_rd >> 1))
       continue;
@@ -4113,8 +4404,9 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
     if (is_directional_mode && av1_use_angle_delta(bsize)) {
       this_rd_stats.rate = INT_MAX;
-      rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
-                              bmode_costs[mbmi->mode], best_rd, &best_model_rd);
+      rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
+                              &this_rd_stats, bsize, bmode_costs[mbmi->mode],
+                              best_rd, &best_model_rd);
     } else {
       super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
     }
@@ -4151,16 +4443,16 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 
   if (try_palette) {
-    rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi,
-                              best_palette_color_map, &best_rd, &best_model_rd,
-                              rate, rate_tokenonly, distortion, skippable, ctx,
-                              ctx->blk_skip);
+    rd_pick_palette_intra_sby(
+        cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi,
+        best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly,
+        distortion, skippable, ctx, ctx->blk_skip);
   }
 
   if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
-    if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
-                                 skippable, bsize, bmode_costs[DC_PRED],
-                                 &best_rd, &best_model_rd, ctx)) {
+    if (rd_pick_filter_intra_sby(
+            cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable,
+            bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) {
       best_mbmi = *mbmi;
     }
   }
@@ -4230,16 +4522,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
 
 static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
                           int blk_row, int blk_col, int plane, int block,
-                          int plane_bsize, const ENTROPY_CONTEXT *a,
-                          const ENTROPY_CONTEXT *l, RD_STATS *rd_stats,
+                          int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
                           FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
                           TXB_RD_INFO *rd_info_array) {
   const struct macroblock_plane *const p = &x->plane[plane];
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   const uint16_t cur_joint_ctx =
-      (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
-
+      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
   const int txk_type_idx =
       av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
   // Look up RD and terminate early in case when we've already processed exactly
@@ -4264,7 +4552,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
 
   RD_STATS this_rd_stats;
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+                  txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
 
@@ -4428,8 +4716,8 @@ static void try_tx_block_no_split(
   rd_stats->zero_rate = zero_blk_rate;
   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
   mbmi->inter_tx_size[index] = tx_size;
-  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
-                ptl, rd_stats, ftxs_mode, ref_best_rd,
+  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+                &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
                 rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
   assert(rd_stats->rate < INT_MAX);
 
@@ -4444,12 +4732,12 @@ static void try_tx_block_no_split(
     rd_stats->rate = zero_blk_rate;
     rd_stats->dist = rd_stats->sse;
     rd_stats->skip = 1;
-    x->blk_skip[blk_row * bw + blk_col] = 1;
+    set_blk_skip(x, 0, blk_row * bw + blk_col, 1);
     p->eobs[block] = 0;
     update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
                      DCT_DCT);
   } else {
-    x->blk_skip[blk_row * bw + blk_col] = 0;
+    set_blk_skip(x, 0, blk_row * bw + blk_col, 0);
     rd_stats->skip = 0;
   }
 
@@ -4482,7 +4770,6 @@ static void try_tx_block_split(
   MACROBLOCKD *const xd = &x->e_mbd;
   const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
-  struct macroblock_plane *const p = &x->plane[0];
   const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
   const int bsw = tx_size_wide_unit[sub_txs];
   const int bsh = tx_size_high_unit[sub_txs];
@@ -4490,10 +4777,7 @@ static void try_tx_block_split(
   RD_STATS this_rd_stats;
   int this_cost_valid = 1;
   int64_t tmp_rd = 0;
-#if CONFIG_DIST_8X8
-  int sub8x8_eob[4] = { 0, 0, 0, 0 };
-  struct macroblockd_plane *const pd = &xd->plane[0];
-#endif
+
   split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
 
   assert(tx_size < TX_SIZES_ALL);
@@ -4511,123 +4795,22 @@ static void try_tx_block_split(
           &this_cost_valid, ftxs_mode,
           (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
 
-#if CONFIG_DIST_8X8
-      if (!x->using_dist_8x8)
-#endif
-        if (!this_cost_valid) goto LOOP_EXIT;
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && tx_size == TX_8X8) {
-        sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
-      }
-#endif  // CONFIG_DIST_8X8
+      if (!this_cost_valid) goto LOOP_EXIT;
+
       av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
 
       tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-#if CONFIG_DIST_8X8
-      if (!x->using_dist_8x8)
-#endif
-        if (no_split_rd < tmp_rd) {
-          this_cost_valid = 0;
-          goto LOOP_EXIT;
-        }
+
+      if (no_split_rd < tmp_rd) {
+        this_cost_valid = 0;
+        goto LOOP_EXIT;
+      }
       block += sub_step;
     }
   }
 
 LOOP_EXIT : {}
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
-    const int src_stride = p->src.stride;
-    const int dst_stride = pd->dst.stride;
-
-    const uint8_t *src =
-        &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-    const uint8_t *dst =
-        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-
-    int64_t dist_8x8;
-    const int qindex = x->qindex;
-    const int pred_stride = block_size_wide[plane_bsize];
-    const int pred_idx = (blk_row * pred_stride + blk_col)
-                         << tx_size_wide_log2[0];
-    const int16_t *pred = &x->pred_luma[pred_idx];
-    int i, j;
-    int row, col;
-
-    uint8_t *pred8;
-    DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-
-    dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8,
-                            8, 8, 8, 8, qindex) *
-               16;
-
-#ifdef DEBUG_DIST_8X8
-    if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
-      assert(sum_rd_stats.sse == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-
-    split_rd_stats->sse = dist_8x8;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      pred8 = CONVERT_TO_BYTEPTR(pred8_16);
-    else
-      pred8 = (uint8_t *)pred8_16;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (row = 0; row < 2; ++row) {
-        for (col = 0; col < 2; ++col) {
-          int idx = row * 2 + col;
-          int eob = sub8x8_eob[idx];
-
-          if (eob > 0) {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                CONVERT_TO_SHORTPTR(pred8)
-                [(row * 4 + j) * 8 + 4 * col + i] =
-                    pred[(row * 4 + j) * pred_stride + 4 * col + i];
-          } else {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                CONVERT_TO_SHORTPTR(pred8)
-                [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
-                    dst)[(row * 4 + j) * dst_stride + 4 * col + i];
-          }
-        }
-      }
-    } else {
-      for (row = 0; row < 2; ++row) {
-        for (col = 0; col < 2; ++col) {
-          int idx = row * 2 + col;
-          int eob = sub8x8_eob[idx];
-
-          if (eob > 0) {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                    (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
-          } else {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                    dst[(row * 4 + j) * dst_stride + 4 * col + i];
-          }
-        }
-      }
-    }
-    dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8,
-                            8, 8, qindex) *
-               16;
-
-#ifdef DEBUG_DIST_8X8
-    if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
-      assert(sum_rd_stats.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-
-    split_rd_stats->dist = dist_8x8;
-    tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-  }
-#endif  // CONFIG_DIST_8X8
   if (this_cost_valid) *split_rd = tmp_rd;
 }
 
@@ -4660,7 +4843,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
 
   const int try_no_split = 1;
   int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
-
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8)
+    try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
+#endif
   TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
 
   // TX no split
@@ -4691,11 +4877,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
     }
   }
 
-#if COLLECT_TX_SIZE_DATA
-  // Do not skip tx_split when collecting tx size data.
-  try_split = 1;
-#endif
-
   // TX split
   int64_t split_rd = INT64_MAX;
   RD_STATS split_rd_stats;
@@ -4707,54 +4888,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                        rd_info_node, &split_rd_stats, &split_rd);
   }
 
-#if COLLECT_TX_SIZE_DATA
-  do {
-    if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break;
-
-#if 0
-    // Randomly select blocks to collect data to reduce output file size.
-    const int rnd_val = rand() % 2;
-    if (rnd_val) break;
-#endif
-
-    const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
-    const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-    const int within_border =
-        mi_row >= xd->tile.mi_row_start &&
-        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
-        mi_col >= xd->tile.mi_col_start &&
-        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
-    if (!within_border) break;
-
-    FILE *fp = fopen(av1_tx_size_data_output_file, "a");
-    if (!fp) break;
-
-    // Split decision, RD cost, block type(inter/intra), q-index, rdmult,
-    // and block size.
-    const int split_selected = sum_rd < this_rd;
-    const int is_inter = 1;
-    const int txb_w = tx_size_wide[tx_size];
-    const int txb_h = tx_size_high[tx_size];
-    fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected,
-            (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex,
-            x->rdmult, is_inter, txb_w, txb_h);
-
-    // Residue signal.
-    const int diff_stride = block_size_wide[plane_bsize];
-    const int16_t *src_diff =
-        &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
-    for (int r = 0; r < txb_h; ++r) {
-      for (int c = 0; c < txb_w; ++c) {
-        fprintf(fp, "%d,", src_diff[c]);
-      }
-      src_diff += diff_stride;
-    }
-    fprintf(fp, "\n");
-
-    fclose(fp);
-  } while (0);
-#endif  // COLLECT_TX_SIZE_DATA
-
   if (no_split.rd < split_rd) {
     ENTROPY_CONTEXT *pta = ta + blk_col;
     ENTROPY_CONTEXT *ptl = tl + blk_row;
@@ -4773,7 +4906,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
     mbmi->tx_size = tx_size_selected;
     update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
                      no_split.tx_type);
-    x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip;
+    set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
   } else {
     *rd_stats = split_rd_stats;
     if (split_rd == INT64_MAX) *is_cost_valid = 0;
@@ -4787,7 +4920,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                                    TXB_RD_INFO_NODE *rd_info_tree) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int is_cost_valid = 1;
-  int64_t this_rd = 0;
+  int64_t this_rd = 0, skip_rd = 0;
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
@@ -4818,42 +4951,39 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
     memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
     memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+    const int skip_ctx = av1_get_skip_context(xd);
+    const int s0 = x->skip_cost[skip_ctx][0];
+    const int s1 = x->skip_cost[skip_ctx][1];
 
+    skip_rd = RDCOST(x->rdmult, s1, 0);
+    this_rd = RDCOST(x->rdmult, s0, 0);
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bw) {
+        int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd)));
         select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
                         plane_bsize, ctxa, ctxl, tx_above, tx_left,
-                        &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid,
-                        ftxs_mode, rd_info_tree);
+                        &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
+                        rd_info_tree);
         if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
           av1_invalid_rd_stats(rd_stats);
           return;
         }
         av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-        this_rd +=
-            AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
-                   RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+        skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+        this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
         block += step;
         if (rd_info_tree != NULL) rd_info_tree += 1;
       }
     }
+    if (skip_rd <= this_rd) {
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip = 1;
+    } else {
+      rd_stats->skip = 0;
+    }
   }
 
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int s0 = x->skip_cost[skip_ctx][0];
-  const int s1 = x->skip_cost[skip_ctx][1];
-  int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-  this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
-  if (skip_rd <= this_rd) {
-    this_rd = skip_rd;
-    rd_stats->rate = 0;
-    rd_stats->dist = rd_stats->sse;
-    rd_stats->skip = 1;
-  } else {
-    rd_stats->skip = 0;
-  }
-  if (this_rd > ref_best_rd) is_cost_valid = 0;
-
   if (!is_cost_valid) {
     // reset cost value
     av1_invalid_rd_stats(rd_stats);
@@ -4945,8 +5075,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                                   .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
     rd_stats->zero_rate = zero_blk_rate;
     rd_stats->ref_rdcost = ref_best_rd;
-    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta,
-                  tl, rd_stats, ftxs_mode, ref_best_rd, NULL);
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+                  &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -4954,14 +5084,14 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
       rd_stats->rate = zero_blk_rate;
       rd_stats->dist = rd_stats->sse;
       rd_stats->skip = 1;
-      x->blk_skip[blk_row * mi_width + blk_col] = 1;
+      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
       x->plane[0].eobs[block] = 0;
       x->plane[0].txb_entropy_ctx[block] = 0;
       update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
                        DCT_DCT);
     } else {
       rd_stats->skip = 0;
-      x->blk_skip[blk_row * mi_width + blk_col] = 0;
+      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
     }
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
       rd_stats->rate += x->txfm_partition_cost[ctx][0];
@@ -5128,12 +5258,13 @@ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
 static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
                                 const uint32_t hash) {
   // Linear search through the circular buffer to find matching hash.
-  int index;
-  for (int i = cur_record->num - 1; i >= 0; i--) {
-    index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN;
-    if (cur_record->hash_vals[index] == hash) return index;
+  for (int i = cur_record->index_start - 1; i >= 0; i--) {
+    if (cur_record->hash_vals[i] == hash) return i;
   }
-
+  for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
+    if (cur_record->hash_vals[i] == hash) return i;
+  }
+  int index;
   // If not found - add new RD info into the buffer and return its index
   if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
     index = (cur_record->index_start + cur_record->num) %
@@ -5150,6 +5281,155 @@ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
   return index;
 }
 
+typedef struct {
+  int leaf;
+  int8_t children[4];
+} RD_RECORD_IDX_NODE;
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
+  { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 1, { 0, 0, 0, 0 } },
+  { 1, { 0, 0, 0, 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 1, { 0 } },
+  { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
+  { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 0, { 3, 4, 5, 6 } },
+  { 0, { 7, 8, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 0, { 3, 4, 7, 8 } },
+  { 0, { 5, 6, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
+  { 0, { 1, 2, 3, 4 } },     { 0, { 5, 6, 9, 10 } },    { 0, { 7, 8, 11, 12 } },
+  { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
+  { 0, { 2, 3, 4, 5 } },     { 0, { 6, 7, 8, 9 } },
+  { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
+  { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
+  { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
+  { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
+  { 0, { 2, 3, 6, 7 } },     { 0, { 4, 5, 8, 9 } },
+  { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
+  { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
+  { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
+  { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
+  { 0, { 4, 5, 8, 9 } },     { 0, { 6, 7, 10, 11 } },
+  { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
+  { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
+  { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
+  { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
+  { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
+  { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
+  { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
+  { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
+  { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
+  { 0, { 1, -1, 2, -1 } },
+  { 0, { 3, 4, -1, -1 } },
+  { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
+  { 0, { 1, 2, -1, -1 } },
+  { 0, { 3, 4, -1, -1 } },
+  { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
+  NULL,                    // BLOCK_4X4
+  NULL,                    // BLOCK_4X8
+  NULL,                    // BLOCK_8X4
+  rd_record_tree_8x8,      // BLOCK_8X8
+  rd_record_tree_8x16,     // BLOCK_8X16
+  rd_record_tree_16x8,     // BLOCK_16X8
+  rd_record_tree_16x16,    // BLOCK_16X16
+  rd_record_tree_1_2,      // BLOCK_16X32
+  rd_record_tree_2_1,      // BLOCK_32X16
+  rd_record_tree_sqr,      // BLOCK_32X32
+  rd_record_tree_1_2,      // BLOCK_32X64
+  rd_record_tree_2_1,      // BLOCK_64X32
+  rd_record_tree_sqr,      // BLOCK_64X64
+  rd_record_tree_64x128,   // BLOCK_64X128
+  rd_record_tree_128x64,   // BLOCK_128X64
+  rd_record_tree_128x128,  // BLOCK_128X128
+  NULL,                    // BLOCK_4X16
+  NULL,                    // BLOCK_16X4
+  rd_record_tree_1_4,      // BLOCK_8X32
+  rd_record_tree_4_1,      // BLOCK_32X8
+  rd_record_tree_1_4,      // BLOCK_16X64
+  rd_record_tree_4_1,      // BLOCK_64X16
+};
+
+static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
+  0,                                                            // BLOCK_4X4
+  0,                                                            // BLOCK_4X8
+  0,                                                            // BLOCK_8X4
+  sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X8
+  sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_8X16
+  sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_16X8
+  sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE),    // BLOCK_16X16
+  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X32
+  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X16
+  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X32
+  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X64
+  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X32
+  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X64
+  sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_64X128
+  sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_128X64
+  sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE),  // BLOCK_128X128
+  0,                                                            // BLOCK_4X16
+  0,                                                            // BLOCK_16X4
+  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X32
+  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X8
+  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X64
+  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X16
+};
+
+static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
+                                       BLOCK_SIZE bsize) {
+  const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
+  const int size = rd_record_tree_size[bsize];
+  for (int i = 0; i < size; ++i) {
+    if (rd_record[i].leaf) {
+      av1_zero(tree[i].children);
+    } else {
+      for (int j = 0; j < 4; ++j) {
+        const int8_t idx = rd_record[i].children[j];
+        tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
+      }
+    }
+  }
+}
+
 // Go through all TX blocks that could be used in TX size search, compute
 // residual hash values for them and find matching RD info that stores previous
 // RD search results for these TX blocks. The idea is to prevent repeated
@@ -5168,26 +5448,23 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
 
   // Hashing is performed only for square TX sizes larger than TX_4X4
   if (max_square_tx_size < TX_8X8) return 0;
-
-  const int bw_mi = mi_size_wide[bsize];
   const int diff_stride = bw;
   const struct macroblock_plane *const p = &x->plane[0];
   const int16_t *diff = &p->src_diff[0];
-
+  init_rd_record_tree(dst_rd_info, bsize);
   // Coordinates of the top-left corner of current block within the superblock
   // measured in pixels:
   const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
   const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
   int cur_rd_info_idx = 0;
   int cur_tx_depth = 0;
-  uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
-  uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
   TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
   while (cur_tx_depth <= MAX_VARTX_DEPTH) {
     const int cur_tx_bw = tx_size_wide[cur_tx_size];
     const int cur_tx_bh = tx_size_high[cur_tx_size];
     if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
     const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+    const int tx_size_idx = cur_tx_size - TX_8X8;
     for (int row = 0; row < bh; row += cur_tx_bh) {
       for (int col = 0; col < bw; col += cur_tx_bw) {
         if (cur_tx_bw != cur_tx_bh) {
@@ -5211,48 +5488,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
           const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
                                                 (uint8_t *)hash_data,
                                                 2 * cur_tx_bw * cur_tx_bh);
-
           // Find corresponding RD info based on the hash value.
-          const int rd_record_idx =
-              row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) +
-              col_in_sb;
-
-          int idx = find_tx_size_rd_info(
-              &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash);
+          const int record_idx =
+              row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
+          TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
+          int idx = find_tx_size_rd_info(records, hash);
           dst_rd_info[cur_rd_info_idx].rd_info_array =
-              &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
-                   .tx_rd_info[idx];
-        }
-
-        // Update the output quadtree RD info structure.
-        av1_zero(dst_rd_info[cur_rd_info_idx].children);
-        const int this_mi_row = row / MI_SIZE;
-        const int this_mi_col = col / MI_SIZE;
-        if (cur_tx_depth > 0) {  // Set up child pointers.
-          const int mi_index = this_mi_row * bw_mi + this_mi_col;
-          const int child_idx = child_idx_buf[mi_index];
-          assert(child_idx < 4);
-          dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] =
-              &dst_rd_info[cur_rd_info_idx];
-        }
-        if (cur_tx_depth < MAX_VARTX_DEPTH) {  // Set up parent and child idx.
-          const int tx_bh_mi = cur_tx_bh / MI_SIZE;
-          const int tx_bw_mi = cur_tx_bw / MI_SIZE;
-          for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) {
-            memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx,
-                   tx_bw_mi);
-          }
-          int child_idx = 0;
-          const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size];
-          const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size];
-          for (int i = this_mi_row; i < this_mi_row + tx_bh_mi;
-               i += next_tx_bh_mi) {
-            for (int j = this_mi_col; j < this_mi_col + tx_bw_mi;
-                 j += next_tx_bw_mi) {
-              assert(child_idx < 4);
-              child_idx_buf[i * bw_mi + j] = child_idx++;
-            }
-          }
+              &records->tx_rd_info[idx];
         }
         ++cur_rd_info_idx;
       }
@@ -5300,7 +5542,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
   const MACROBLOCKD *xd = &x->e_mbd;
   const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
 
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1);
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
   const int64_t mse = *dist / bw / bh;
   // Normalized quantizer takes the transform upscaling factor (8 for tx size
   // smaller than 32) into account.
@@ -5354,7 +5596,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
   memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
   memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
   mbmi->tx_size = tx_size;
-  memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4);
+  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
   rd_stats->skip = 1;
   rd_stats->rate = 0;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
@@ -5388,17 +5630,21 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     int model_rate;
     int64_t model_dist;
     int model_skip;
-    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist,
-                    &model_skip, NULL, NULL, NULL, NULL);
+    model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+        cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist,
+        &model_skip, NULL, NULL, NULL, NULL);
     const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
     // If the modeled rd is a lot worse than the best so far, breakout.
     // TODO(debargha, urvang): Improve the model and make the check below
     // tighter.
     assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
            cpi->sf.model_based_prune_tx_search_level <= 2);
+    static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
+                                            4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
     if (!model_skip &&
-        model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) >
-            ref_best_rd)
+        ((model_rd *
+          prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
+         3) > ref_best_rd)
       return;
   }
 
@@ -5431,7 +5677,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
   // Precompute residual hashes and find existing or add new RD records to
   // store and reuse rate and distortion values to speed up TX size search.
-  TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256];
+  TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
   int found_rd_info = 0;
   if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
     found_rd_info =
@@ -5479,34 +5725,61 @@ static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   assert(plane > 0);
   assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   ENTROPY_CONTEXT *ta = above_ctx + blk_col;
   ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
   tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
-                ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL);
+                &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
+
+  const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  const int blk_idx = blk_row * mi_width + blk_col;
+
   av1_set_txb_context(x, plane, block, tx_size, ta, tl);
+  if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+           RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+       rd_stats->skip == 1) &&
+      !xd->lossless[mbmi->segment_id]) {
+    rd_stats->rate = zero_blk_rate;
+    rd_stats->dist = rd_stats->sse;
+  }
+
+  // Set chroma blk_skip to 0
+  set_blk_skip(x, plane, blk_idx, 0);
 }
 
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
 static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t ref_best_rd,
+                            int64_t non_skip_ref_best_rd,
+                            int64_t skip_ref_best_rd,
                             FAST_TX_SEARCH_MODE ftxs_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   int plane;
   int is_cost_valid = 1;
   int64_t this_rd = 0;
+  int64_t skip_rd = 0;
 
-  if (ref_best_rd < 0) is_cost_valid = 0;
+  if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
 
   av1_init_rd_stats(rd_stats);
 
-  if (x->skip_chroma_rd) return is_cost_valid;
+  if (x->skip_chroma_rd) {
+    if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+
+    return is_cost_valid;
+  }
+
   const BLOCK_SIZE bsizec = scale_chroma_bsize(
       bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
 
@@ -5531,36 +5804,31 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
       const int step = bh * bw;
       ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
       ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
-      RD_STATS pn_rd_stats;
-      av1_init_rd_stats(&pn_rd_stats);
       av1_get_entropy_contexts(bsizec, pd, ta, tl);
 
       for (idy = 0; idy < mi_height; idy += bh) {
         for (idx = 0; idx < mi_width; idx += bw) {
+          RD_STATS pn_rd_stats;
+          av1_init_rd_stats(&pn_rd_stats);
           tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
                         plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+          if (pn_rd_stats.rate == INT_MAX) {
+            av1_invalid_rd_stats(rd_stats);
+            return 0;
+          }
+          av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+          this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+          skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+          if ((this_rd > non_skip_ref_best_rd) &&
+              (skip_rd > skip_ref_best_rd)) {
+            av1_invalid_rd_stats(rd_stats);
+            return 0;
+          }
           block += step;
         }
       }
-
-      if (pn_rd_stats.rate == INT_MAX) {
-        is_cost_valid = 0;
-        break;
-      }
-
-      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-
-      this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
-                       RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse));
-
-      if (this_rd > ref_best_rd) {
-        is_cost_valid = 0;
-        break;
-      }
     }
-  }
-
-  if (!is_cost_valid) {
+  } else {
     // reset cost value
     av1_invalid_rd_stats(rd_stats);
   }
@@ -6137,9 +6405,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
          (mv->col >> 3) > mv_limits->col_max;
 }
 
-static INLINE int get_single_mode(int this_mode, int ref_idx,
-                                  int is_comp_pred) {
-  int single_mode;
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+                                              int ref_idx, int is_comp_pred) {
+  PREDICTION_MODE single_mode;
   if (is_comp_pred) {
     single_mode =
         ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
@@ -6149,63 +6417,6 @@ static INLINE int get_single_mode(int this_mode, int ref_idx,
   return single_mode;
 }
 
-/* If the current mode shares the same mv with other modes with higher prority,
- * skip this mode. This priority order is nearest > global > near. */
-static int skip_repeated_mv(const AV1_COMMON *const cm,
-                            const MACROBLOCK *const x, int this_mode,
-                            const MV_REFERENCE_FRAME ref_frames[2]) {
-  const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
-  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  if (!is_comp_pred) {
-    if (this_mode == NEARMV) {
-      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
-        // NEARMV has the same motion vector as NEARESTMV
-        return 1;
-      }
-      if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
-          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
-        // NEARMV has the same motion vector as GLOBALMV
-        return 1;
-      }
-    }
-    if (this_mode == GLOBALMV) {
-      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
-          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
-        // GLOBALMV has the same motion vector as NEARESTMV
-        return 1;
-      }
-    }
-  } else {
-    for (int i = 0; i < 2; ++i) {
-      const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
-      if (single_mode == NEARMV) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
-          // NEARMV has the same motion vector as NEARESTMV in compound mode
-          return 1;
-        }
-      }
-    }
-    if (this_mode == NEAR_NEARMV) {
-      if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
-          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
-          cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
-        // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV
-        return 1;
-      }
-    }
-    if (this_mode == GLOBAL_GLOBALMV) {
-      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
-          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
-          cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
-        // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV
-        return 1;
-      }
-    }
-  }
-  return 0;
-}
-
 static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
                                 int mi_col, int_mv *ref_mv_sub8x8[2],
@@ -6215,10 +6426,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   const int num_planes = av1_num_planes(cm);
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
+  const int plane = 0;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   // This function should only ever be called for compound modes
   assert(has_second_ref(mbmi));
+  const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
   int_mv ref_mv[2];
   int ite, ref;
@@ -6228,11 +6441,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   struct macroblockd_plane *const pd = &xd->plane[0];
   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-  int is_global[2];
+
+  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+  conv_params.use_jnt_comp_avg = 0;
+  WarpTypesAllowed warp_types[2];
   for (ref = 0; ref < 2; ++ref) {
     const WarpedMotionParams *const wm =
         &xd->global_motion[xd->mi[0]->ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype);
+    const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
+    warp_types[ref].global_warp_allowed = is_global;
+    warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
   }
 
   // Do joint motion search in compound mode to get more accurate mv.
@@ -6244,30 +6462,38 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   };
 
   // Prediction buffer from second frame.
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
-  uint8_t *second_pred;
+  DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+  uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
   (void)ref_mv_sub8x8;
 
+  const int have_newmv = have_nearmv_in_inter_mode(mbmi->mode);
+  const int ref_mv_idx = mbmi->ref_mv_idx + (have_newmv ? 1 : 0);
+  MV *const best_mv = &x->best_mv.as_mv;
+  const int search_range = SEARCH_RANGE_8P;
+  const int sadpb = x->sadperbit16;
   // Allow joint search multiple times iteratively for each reference frame
   // and break out of the search loop if it couldn't find a better mv.
   for (ite = 0; ite < 4; ite++) {
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
-    int sadpb = x->sadperbit16;
-    MV *const best_mv = &x->best_mv.as_mv;
-    int search_range = 3;
-
     MvLimits tmp_mv_limits = x->mv_limits;
     int id = ite % 2;  // Even iterations search in the first reference frame,
                        // odd iterations search in the second. The predictor
                        // found for the 'other' reference frame is factored in.
-    const int plane = 0;
-    ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd);
-    conv_params.use_jnt_comp_avg = 0;
-    WarpTypesAllowed warp_types;
-    warp_types.global_warp_allowed = is_global[!id];
-    warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-
+    // From the third pass on, stop when the other reference's mv is still at
+    // its initial value and this reference's mv is too, at least once the
+    // 3 fractional bits are dropped.
+    if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+      if (cur_mv[id].as_int == init_mv[id].as_int) break;
+      // Same check at reduced precision: each component is compared against
+      // its own initial value (row with row, col with col).
+      int_mv cur_int_mv, init_int_mv;
+      cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+      cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+      init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+      init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+      if (cur_int_mv.as_int == init_int_mv.as_int) break;
+    }
     for (ref = 0; ref < 2; ++ref) {
       ref_mv[ref] = av1_get_ref_mv(x, ref);
       // Swap out the reference frame for a version that's been scaled to
@@ -6294,26 +6520,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     ref_yv12[1] = xd->plane[plane].pre[1];
 
     // Get the prediction block from the 'other' reference frame.
-    InterpFilters interp_filters = EIGHTTAP_REGULAR;
+    const InterpFilters interp_filters = EIGHTTAP_REGULAR;
 
     // Since we have scaled the reference frames to match the size of the
     // current frame we must use a unit scaling factor during mode selection.
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
-      av1_highbd_build_inter_predictor(
-          ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-          &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters,
-          &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE,
-          mi_row * MI_SIZE, xd, cm->allow_warped_motion);
-    } else {
-      second_pred = (uint8_t *)second_pred_alloc_16;
-      av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
-                                second_pred, pw, &cur_mv[!id].as_mv,
-                                &cm->sf_identity, pw, ph, &conv_params,
-                                interp_filters, &warp_types, p_col, p_row,
-                                plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
-                                mi_row * MI_SIZE, xd, cm->allow_warped_motion);
-    }
+    av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+                              second_pred, pw, &cur_mv[!id].as_mv,
+                              &cm->sf_identity, pw, ph, &conv_params,
+                              interp_filters, &warp_types[!id], p_col, p_row,
+                              plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+                              mi_row * MI_SIZE, xd, cm->allow_warped_motion);
 
     const int order_idx = id != 0;
     av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
@@ -6324,16 +6540,13 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     if (id) xd->plane[plane].pre[0] = ref_yv12[id];
     av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
 
-    // Use the mv result from the single mode as mv predictor.
     // Use the mv result from the single mode as mv predictor.
     *best_mv = cur_mv[id].as_mv;
 
     best_mv->col >>= 3;
     best_mv->row >>= 3;
 
-    av1_set_mvcost(
-        x, id,
-        mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+    av1_set_mvcost(x, id, ref_mv_idx);
 
     // Small-range full-pixel motion search.
     bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -6385,7 +6598,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
     // Restore the pointer to the first prediction buffer.
     if (id) xd->plane[plane].pre[0] = ref_yv12[0];
-
     if (bestsme < last_besterr[id]) {
       cur_mv[id].as_mv = *best_mv;
       last_besterr[id] = bestsme;
@@ -6397,10 +6609,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   *rate_mv = 0;
 
   for (ref = 0; ref < 2; ++ref) {
-    av1_set_mvcost(
-        x, ref,
-        mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
-
+    av1_set_mvcost(x, ref, ref_mv_idx);
     const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
     *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -6710,16 +6919,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION:
-      bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                      sadpb, cond_cost_list(cpi, cost_list),
-                                      &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
-                                      (MI_SIZE * mi_row), 0);
+      bestsme = av1_full_pixel_search(
+          cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+          sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+          (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
       break;
     case OBMC_CAUSAL:
-      bestsme = av1_obmc_full_pixel_diamond(
-          cpi, x, &mvp_full, step_param, sadpb,
-          MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
-          &(x->best_mv.as_mv), 0);
+      bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
+                                           MAX_MVSEARCH_STEPS - 1 - step_param,
+                                           1, &cpi->fn_ptr[bsize], &ref_mv,
+                                           &(x->best_mv.as_mv), 0);
       break;
     default: assert(0 && "Invalid motion mode!\n");
   }
@@ -6850,25 +7059,17 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
   av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
                                     cm->width, cm->height);
 
-  ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd);
+  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
   WarpTypesAllowed warp_types;
   warp_types.global_warp_allowed = is_global;
   warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
 
   // Get the prediction block from the 'other' reference frame.
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    av1_highbd_build_inter_predictor(
-        ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-        0, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
-        MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
-        cm->allow_warped_motion);
-  } else {
-    av1_build_inter_predictor(
-        ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-        &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
-        !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
-        cm->allow_warped_motion);
-  }
+  av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw,
+                            other_mv, &sf, pw, ph, &conv_params,
+                            mbmi->interp_filters, &warp_types, p_col, p_row,
+                            plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
+                            mi_row * MI_SIZE, xd, cm->allow_warped_motion);
 
   av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
                              &xd->jcp_param.bck_offset,
@@ -6921,7 +7122,7 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   int bestsme = INT_MAX;
   int sadpb = x->sadperbit16;
   MV *const best_mv = &x->best_mv.as_mv;
-  int search_range = 3;
+  int search_range = SEARCH_RANGE_8P;
 
   MvLimits tmp_mv_limits = x->mv_limits;
 
@@ -7056,12 +7257,12 @@ static void do_masked_motion_search_indexed(
 // near mv modes to reduce distortion in subsequent blocks and also improve
 // visual quality.
 #define NEW_MV_DISCOUNT_FACTOR 8
-static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
-                               int ref_mv_idx,
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+                               int ref_idx, int ref_mv_idx,
                                const MV_REFERENCE_FRAME *ref_frame,
                                const MB_MODE_INFO_EXT *mbmi_ext);
 static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
-                               int this_mode, int_mv this_mv) {
+                               PREDICTION_MODE this_mode, int_mv this_mv) {
   if (this_mode == NEWMV && this_mv.as_int != 0 &&
       !cpi->rc.is_src_frame_alt_ref) {
     // Only discount new_mv when nearst_mv and all near_mv are zero, and the
@@ -7176,6 +7377,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int N = bw * bh;
+  assert(N >= 64);
   int rate;
   int64_t dist;
   int64_t rd, best_rd = INT64_MAX;
@@ -7199,28 +7401,27 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
                         (int64_t)aom_sum_squares_i16(residual1, N)) *
                        (1 << WEDGE_WEIGHT_BITS) / 2;
   int16_t *ds = residual0;
-  if (N < 64)
-    av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N);
-  else
-    av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+  av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
 
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
 
-    // TODO(jingning): Make sse2 functions support N = 16 case
-    if (N < 64)
-      wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
-    else
-      wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+    wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
 
     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-    if (N < 64)
-      sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
-    else
-      sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
-    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    // int rate2;
+    // int64_t dist2;
+    // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+    // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+    // sse, rate, dist, rate2, dist2); dist = dist2;
+    // rate = rate2;
+
     rate += x->wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
@@ -7248,6 +7449,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int N = bw * bh;
+  assert(N >= 64);
   int rate;
   int64_t dist;
   int64_t rd, best_rd = INT64_MAX;
@@ -7259,13 +7461,11 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-    if (N < 64)
-      sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
-    else
-      sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
-    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
     rate += x->wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
@@ -7317,50 +7517,45 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
-  const int N = bw * bh;
+  const int N = 1 << num_pels_log2_lookup[bsize];
   int rate;
-  uint64_t sse;
   int64_t dist;
-  int64_t rd0;
   DIFFWTD_MASK_TYPE cur_mask_type;
   int64_t best_rd = INT64_MAX;
   DIFFWTD_MASK_TYPE best_mask_type = 0;
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+  uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
   // try each mask type and its inverse
   for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
     // build mask and inverse
     if (hbd)
       av1_build_compound_diffwtd_mask_highbd(
-          xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+          tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
           CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
     else
-      av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1,
-                                      bw, bh, bw);
+      av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
+                                      p0, bw, p1, bw, bh, bw);
 
     // compute rd for mask
-    sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N);
+    uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
+                                                tmp_mask[cur_mask_type], N);
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
-    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
-    rd0 = RDCOST(x->rdmult, rate, dist);
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
 
     if (rd0 < best_rd) {
       best_mask_type = cur_mask_type;
       best_rd = rd0;
     }
   }
-
-  // make final mask
   mbmi->interinter_comp.mask_type = best_mask_type;
-  if (hbd)
-    av1_build_compound_diffwtd_mask_highbd(
-        xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0),
-        bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
-  else
-    av1_build_compound_diffwtd_mask(
-        xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw);
-
+  if (best_mask_type == DIFFWTD_38_INV) {
+    memcpy(xd->seg_mask, seg_mask, N * 2);
+  }
   return best_rd;
 }
 
@@ -7413,9 +7608,12 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 }
 
-static int interinter_compound_motion_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
-    const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) {
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                             MACROBLOCK *x,
+                                             const int_mv *const cur_mv,
+                                             const BLOCK_SIZE bsize,
+                                             const PREDICTION_MODE this_mode,
+                                             int mi_row, int mi_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   int_mv tmp_mv[2];
@@ -7440,11 +7638,40 @@ static int interinter_compound_motion_search(
   return tmp_rate_mv;
 }
 
+// Fills preds0 / preds1 with the inter predictors used by the masked compound
+// modes, then derives residual1 and diff10 from them and the plane-0 source.
+static void get_inter_predictors_masked_compound(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+    int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1,
+    int16_t *residual1, int16_t *diff10, int *strides) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const int can_use_previous = cpi->common.allow_warped_motion;
+  av1_build_inter_predictors_for_planes_single_buf(
+      xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
+  av1_build_inter_predictors_for_planes_single_buf(
+      xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
+  if (!get_bitdepth_data_path_index(xd)) {
+    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+                       bw);
+    aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+  } else {
+    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
+    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+                              bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
+  }
+}
+
 static int64_t build_and_cost_compound_type(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
-    const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv,
-    BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
-    int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) {
+    const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+    int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+    uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+    int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
+    int *calc_pred_masked_compound) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -7456,19 +7683,30 @@ static int64_t build_and_cost_compound_type(
   int64_t tmp_skip_sse_sb;
   const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
 
+  if (*calc_pred_masked_compound) {
+    get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
+                                         preds1, residual1, diff10, strides);
+    *calc_pred_masked_compound = 0;
+  }
+
   best_rd_cur =
       pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
   *rs2 += get_interinter_compound_mask_rate(x, mbmi);
   best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
 
-  if (have_newmv_in_inter_mode(this_mode) &&
-      use_masked_motion_search(compound_type)) {
+  // The true rate_mv may differ after motion search, but a mode whose overhead
+  // alone already exceeds ref_best_rd is unlikely to end up as the best mode
+  // once the transform rd cost and other mode overhead are added.
+  int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+  if (mode_rd > ref_best_rd) return INT64_MAX;
+
+  if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
     *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
                                                      this_mode, mi_row, mi_col);
     av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-    av1_subtract_plane(x, bsize, 0);
-    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                    &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+    model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+        cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+        &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
     rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
     if (rd >= best_rd_cur) {
       mbmi->mv[0].as_int = cur_mv[0].as_int;
@@ -7508,12 +7746,72 @@ typedef struct {
   int (*single_newmv_valid)[REF_FRAMES];
   // Pointer to array of predicted rate-distortion
   // Should point to first of 2 arrays in 2D array
-  int64_t (*modelled_rd)[REF_FRAMES];
+  int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
   InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
   int ref_frame_cost;
   int single_comp_cost;
+  int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+  int skip_motion_mode;
+  INTERINTRA_MODE *inter_intra_mode;
 } HandleInterModeArgs;
 
+/* Skips a single-reference NEARMV/GLOBALMV mode that would repeat the motion
+ * vector of an already-searched mode with a strictly lower mode cost. On a
+ * skip, that mode's modelled_rd entry is copied to this_mode's entry.
+ * Compound modes are never skipped. Returns 1 to skip, 0 otherwise. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+                            const MACROBLOCK *const x,
+                            PREDICTION_MODE this_mode,
+                            const MV_REFERENCE_FRAME ref_frames[2],
+                            InterModeSearchState *search_state) {
+  const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+  if (is_comp_pred) return 0;
+
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+  const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+  PREDICTION_MODE compare_mode = MB_MODE_COUNT;
+  switch (this_mode) {
+    case NEARMV:
+      if (ref_mv_count == 0) {
+        // NEARMV has the same motion vector as NEARESTMV
+        compare_mode = NEARESTMV;
+      } else if (ref_mv_count == 1 &&
+                 cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // NEARMV has the same motion vector as GLOBALMV
+        compare_mode = GLOBALMV;
+      }
+      break;
+    case GLOBALMV:
+      if (ref_mv_count == 0 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // GLOBALMV has the same motion vector as NEARESTMV
+        compare_mode = NEARESTMV;
+      } else if (ref_mv_count == 1) {
+        // GLOBALMV has the same motion vector as NEARMV
+        compare_mode = NEARMV;
+      }
+      break;
+    default: break;
+  }
+  if (compare_mode == MB_MODE_COUNT) return 0;
+
+  // Use modelled_rd to check whether compare mode was searched
+  const int64_t compare_rd =
+      search_state->modelled_rd[compare_mode][0][ref_frames[0]];
+  if (compare_rd == INT64_MAX) return 0;
+
+  const int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+  const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx);
+  const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+
+  // Only skip if the mode cost is larger than compare mode cost
+  if (this_cost <= compare_cost) return 0;
+  search_state->modelled_rd[this_mode][0][ref_frames[0]] = compare_rd;
+  return 1;
+}
+
 static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
                                      const AV1_COMMON *cm,
                                      const MACROBLOCK *x) {
@@ -7640,62 +7938,97 @@ static INLINE int64_t interpolation_filter_rd(
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  int tmp_rate, tmp_skip_sb = 0;
-  int64_t tmp_dist, tmp_skip_sse = INT64_MAX;
+  int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 };
+  int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 };
 
   const InterpFilters last_best = mbmi->interp_filters;
   mbmi->interp_filters = filter_sets[filter_idx];
   const int tmp_rs =
       get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
 
-  if (!skip_pred) {
-    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
-    av1_subtract_plane(x, bsize, 0);
-#if DNN_BASED_RD_INTERP_FILTER
-    model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist,
-                             &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
-#else
-    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb,
-                    &tmp_skip_sse, NULL, NULL, NULL);
-#endif
+  assert(skip_pred != 2);
+  assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
+  assert(rate[0] >= 0);
+  assert(dist[0] >= 0);
+  assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1));
+  assert(skip_sse_sb[0] >= 0);
+  assert(rate[1] >= 0);
+  assert(dist[1] >= 0);
+  assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1));
+  assert(skip_sse_sb[1] >= 0);
+
+  if (skip_pred != cpi->default_interp_skip_flags) {
+    if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+      av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_COLLECT_RD_STATS == 3
+      RD_STATS rd_stats_y;
+      select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+      PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 3
+      model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+          &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL);
+      tmp_rate[1] = tmp_rate[0];
+      tmp_dist[1] = tmp_dist[0];
+    } else {
+      // only luma MC is skipped
+      tmp_rate[1] = rate[0];
+      tmp_dist[1] = dist[0];
+    }
     if (num_planes > 1) {
-      int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
-      if (tmp_y_rd > *rd) {
-        mbmi->interp_filters = last_best;
-        return 0;
+      for (int plane = 1; plane < num_planes; ++plane) {
+        int tmp_rate_uv, tmp_skip_sb_uv;
+        int64_t tmp_dist_uv, tmp_skip_sse_uv;
+        int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+        if (tmp_rd >= *rd) {
+          mbmi->interp_filters = last_best;
+          return 0;
+        }
+        av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                       plane);
+        model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+            cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
+            &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
+        tmp_rate[1] =
+            (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX);
+        tmp_dist[1] += tmp_dist_uv;
+        tmp_skip_sb[1] &= tmp_skip_sb_uv;
+        tmp_skip_sse[1] += tmp_skip_sse_uv;
       }
-      int tmp_rate_uv, tmp_skip_sb_uv;
-      int64_t tmp_dist_uv, tmp_skip_sse_uv;
-      av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
-      for (int plane = 1; plane < num_planes; ++plane)
-        av1_subtract_plane(x, bsize, plane);
-#if DNN_BASED_RD_INTERP_FILTER
-      model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1,
-                               &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv,
-                               &tmp_skip_sse_uv, NULL, NULL, NULL);
-#else
-      model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv,
-                      &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL,
-                      NULL, NULL);
-#endif
-      tmp_rate += tmp_rate_uv;
-      tmp_skip_sb &= tmp_skip_sb_uv;
-      tmp_dist += tmp_dist_uv;
-      tmp_skip_sse += tmp_skip_sse_uv;
     }
   } else {
-    tmp_rate = *rate;
-    tmp_dist = *dist;
+    // both luma and chroma MC is skipped
+    tmp_rate[1] = rate[1];
+    tmp_dist[1] = dist[1];
   }
-  int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+  int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+
   if (tmp_rd < *rd) {
     *rd = tmp_rd;
     *switchable_rate = tmp_rs;
-    *skip_txfm_sb = tmp_skip_sb;
-    *skip_sse_sb = tmp_skip_sse;
-    *rate = tmp_rate;
-    *dist = tmp_dist;
-    if (!skip_pred) {
+    if (skip_pred != cpi->default_interp_skip_flags) {
+      if (skip_pred == 0) {
+        // Overwrite the data as current filter is the best one
+        tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1];
+        tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1];
+        memcpy(rate, tmp_rate, sizeof(*rate) * 2);
+        memcpy(dist, tmp_dist, sizeof(*dist) * 2);
+        memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2);
+        memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2);
+        // As luma MC data is computed, no need to recompute after the search
+        x->recalc_luma_mc_data = 0;
+      } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+        // As luma MC data is not computed, update of luma data can be skipped
+        rate[1] = tmp_rate[1];
+        dist[1] = tmp_dist[1];
+        skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1];
+        skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1];
+        // As luma MC data is not recomputed and current filter is the best,
+        // indicate the possibility of recomputing MC data
+        // If current buffer contains valid MC data, toggle to indicate that
+        // luma MC data needs to be recomputed
+        x->recalc_luma_mc_data ^= 1;
+      }
       swap_dst_buf(xd, dst_bufs, num_planes);
     }
     return 1;
@@ -7715,8 +8048,8 @@ static INLINE int find_best_horiz_interp_filter_rd(
   int i;
   const int bw = block_size_wide[bsize];
   assert(best_dual_mode == 0);
-  if ((bw <= 4) && (!skip_hor)) {
-    int skip_pred = 1;
+  if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) {
+    int skip_pred = cpi->default_interp_skip_flags;
     // Process the filters in reverse order to enable reusing rate and
     // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
     for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
@@ -7726,7 +8059,7 @@ static INLINE int find_best_horiz_interp_filter_rd(
                                   dist)) {
         best_dual_mode = i;
       }
-      skip_pred = 0;
+      skip_pred = skip_hor;
     }
   } else {
     for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
@@ -7751,8 +8084,8 @@ static INLINE void find_best_vert_interp_filter_rd(
     int best_dual_mode, int filter_set_size) {
   int i;
   const int bh = block_size_high[bsize];
-  if ((bh <= 4) && (!skip_ver)) {
-    int skip_pred = 1;
+  if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) {
+    int skip_pred = cpi->default_interp_skip_flags;
     // Process the filters in reverse order to enable reusing rate and
     // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
     assert(filter_set_size == DUAL_FILTER_SET_SIZE);
@@ -7762,7 +8095,7 @@ static INLINE void find_best_vert_interp_filter_rd(
                               switchable_rate, skip_txfm_sb, skip_sse_sb,
                               dst_bufs, i, switchable_ctx, skip_pred, rate,
                               dist);
-      skip_pred = 0;
+      skip_pred = skip_ver;
     }
   } else {
     for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
@@ -7784,6 +8117,7 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
       return 0;
     }
   }
+  if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0;
   return 1;
 }
 
@@ -7806,11 +8140,11 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
   const int comp_idx = mbmi->compound_idx;
   const int offset = x->interp_filter_stats_idx[comp_idx];
   if (offset < MAX_INTERP_FILTER_STATS) {
-    INTERPOLATION_FILTER_STATS stat = {
-      mbmi->interp_filters,
-      { mbmi->mv[0], mbmi->mv[1] },
-      { mbmi->ref_frame[0], mbmi->ref_frame[1] },
-    };
+    INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+                                        { mbmi->mv[0], mbmi->mv[1] },
+                                        { mbmi->ref_frame[0],
+                                          mbmi->ref_frame[1] },
+                                        mbmi->interinter_comp.type };
     x->interp_filter_stats[comp_idx][offset] = stat;
     x->interp_filter_stats_idx[comp_idx]++;
   }
@@ -7821,15 +8155,22 @@ static int64_t interpolation_filter_search(
     int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
     BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
     int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
-    int64_t *const skip_sse_sb) {
+    int64_t *const skip_sse_sb, const int skip_build_pred,
+    HandleInterModeArgs *args, int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int need_search =
       av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
-  int i, tmp_rate;
-  int64_t tmp_dist;
+  int i;
+  // Index 0 corresponds to luma rd data and index 1 corresponds to cumulative
+  // data of all planes
+  int tmp_rate[2] = { 0, 0 };
+  int64_t tmp_dist[2] = { 0, 0 };
+  int best_skip_txfm_sb[2] = { 1, 1 };
+  int64_t best_skip_sse_sb[2] = { 0, 0 };
+  const int ref_frame = xd->mi[0]->ref_frame[0];
 
   (void)single_filter;
   int match_found = -1;
@@ -7845,18 +8186,32 @@ static int64_t interpolation_filter_search(
   switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
   *switchable_rate =
       get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
-  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-  for (int plane = 0; plane < num_planes; ++plane)
-    av1_subtract_plane(x, bsize, plane);
-#if DNN_BASED_RD_INTERP_FILTER
-  model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
-                           &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL,
-                           NULL);
-#else
-  model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
-                  skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
-#endif  // DNN_BASED_RD_INTERP_FILTER
-  *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
+  if (!skip_build_pred)
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+  RD_STATS rd_stats_y;
+  select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+  PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 3
+  model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+      cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+      &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL);
+  if (num_planes > 1)
+    model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+        cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1],
+        &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL,
+        NULL);
+  tmp_rate[1] =
+      (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX);
+  assert(tmp_rate[1] >= 0);
+  tmp_dist[1] = tmp_dist[0] + tmp_dist[1];
+  best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1];
+  best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1];
+  *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]);
+  *skip_txfm_sb = best_skip_txfm_sb[1];
+  *skip_sse_sb = best_skip_sse_sb[1];
+  x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
 
   if (assign_filter != SWITCHABLE || match_found != -1) {
     return 0;
@@ -7866,22 +8221,71 @@ static int64_t interpolation_filter_search(
            av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
     return 0;
   }
-  int skip_hor = 1;
-  int skip_ver = 1;
+  if (args->modelled_rd != NULL) {
+    if (has_second_ref(mbmi)) {
+      const int ref_mv_idx = mbmi->ref_mv_idx;
+      int refs[2] = { mbmi->ref_frame[0],
+                      (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+      const int mode0 = compound_ref0_mode(mbmi->mode);
+      const int mode1 = compound_ref1_mode(mbmi->mode);
+      const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                                 args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+      if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+        return INT64_MAX;
+      }
+    }
+  }
+
+  x->recalc_luma_mc_data = 0;
+  // skip_flag=xx (in binary form)
+  // Setting bit 0 corresponds to skipping luma MC and setting bit 1
+  // corresponds to skipping chroma MC. skip_flag=0 corresponds to "Don't skip
+  // luma and chroma MC". skip_flag=1 corresponds to "Skip luma MC only".
+  // skip_flag=2 is not a valid case.
+  // skip_flag=3 corresponds to "Skip both luma and chroma MC".
+  int skip_hor = cpi->default_interp_skip_flags;
+  int skip_ver = cpi->default_interp_skip_flags;
   const int is_compound = has_second_ref(mbmi);
-  for (int k = 0; k < num_planes - 1; ++k) {
-    struct macroblockd_plane *const pd = &xd->plane[k];
-    const int bw = pd->width;
-    const int bh = pd->height;
-    for (int j = 0; j < 1 + is_compound; ++j) {
-      const MV mv = mbmi->mv[j].as_mv;
+  assert(is_intrabc_block(mbmi) == 0);
+  for (int j = 0; j < 1 + is_compound; ++j) {
+    const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
+    const struct scale_factors *const sf = &ref_buf->sf;
+    // TODO(any): Refine skip flag calculation considering scaling
+    if (av1_is_scaled(sf)) {
+      skip_hor = 0;
+      skip_ver = 0;
+      break;
+    }
+    const MV mv = mbmi->mv[j].as_mv;
+    int skip_hor_plane = 0;
+    int skip_ver_plane = 0;
+    for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) {
+      struct macroblockd_plane *const pd = &xd->plane[k];
+      const int bw = pd->width;
+      const int bh = pd->height;
       const MV mv_q4 = clamp_mv_to_umv_border_sb(
           xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
       const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
       const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-      skip_hor &= (sub_x == 0);
-      skip_ver &= (sub_y == 0);
-    }
+      skip_hor_plane |= ((sub_x == 0) << k);
+      skip_ver_plane |= ((sub_y == 0) << k);
+    }
+    skip_hor = skip_hor & skip_hor_plane;
+    skip_ver = skip_ver & skip_ver_plane;
+    // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+    assert(skip_hor != 2);
+    assert(skip_ver != 2);
+  }
+  // When compound prediction type is compound segment wedge, luma MC and chroma
+  // MC need to go hand in hand as mask generated during luma MC is required for
+  // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during
+  // vertical filter decision may be incorrect as temporary MC evaluation
+  // overwrites the mask. Make skip_ver as 0 for this case so that mask is
+  // populated during luma MC
+  if (is_compound && mbmi->compound_idx == 1 &&
+      mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+    assert(mbmi->comp_group_idx == 1);
+    if (skip_hor == 0 && skip_ver == 1) skip_ver = 0;
   }
   // do interp_filter search
   const int filter_set_size = DUAL_FILTER_SET_SIZE;
@@ -7895,14 +8299,14 @@ static int64_t interpolation_filter_search(
     // EIGHTTAP_REGULAR mode is calculated beforehand
     best_dual_mode = find_best_horiz_interp_filter_rd(
         x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
-        skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
-        &tmp_rate, &tmp_dist, best_dual_mode);
+        best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+        tmp_rate, tmp_dist, best_dual_mode);
 
     // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
     find_best_vert_interp_filter_rd(
         x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
-        skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
-        &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size);
+        best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+        tmp_rate, tmp_dist, best_dual_mode, filter_set_size);
   } else {
     // EIGHTTAP_REGULAR mode is calculated beforehand
     for (i = 1; i < filter_set_size; ++i) {
@@ -7912,12 +8316,25 @@ static int64_t interpolation_filter_search(
         if (filter_x != filter_y) continue;
       }
       interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
-                              switchable_rate, skip_txfm_sb, skip_sse_sb,
-                              dst_bufs, i, switchable_ctx, 0, &tmp_rate,
-                              &tmp_dist);
+                              switchable_rate, best_skip_txfm_sb,
+                              best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0,
+                              tmp_rate, tmp_dist);
+      assert(x->recalc_luma_mc_data == 0);
     }
   }
   swap_dst_buf(xd, dst_bufs, num_planes);
+  // Recompute final MC data if required
+  if (x->recalc_luma_mc_data == 1) {
+    // Recomputing final luma MC data is required only if the same was skipped
+    // in either of the directions. The condition below is necessary, but not
+    // sufficient.
+    assert((skip_hor == 1) || (skip_ver == 1));
+    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+  }
+  *skip_txfm_sb = best_skip_txfm_sb[1];
+  *skip_sse_sb = best_skip_sse_sb[1];
+  x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
   // save search results
   if (cpi->sf.skip_repeat_interpolation_filter_search) {
     assert(match_found == -1);
@@ -7926,6 +8343,301 @@ static int64_t interpolation_filter_search(
   return 0;
 }
 
+static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                       int mi_row, int mi_col, RD_STATS *rd_stats,
+                       RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+                       int mode_rate, int64_t ref_best_rd) {
+  /*
+   * This function combines y and uv planes' transform search processes
+   * together, when the prediction is generated. It first does subtraction to
+   * obtain the prediction error. Then it calls
+   * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
+   * handles the early terminations that happen in those functions. At the
+   * end, it computes rd_stats/_y/_uv. Returns 0 on early exit, 1 otherwise.
+   */
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int skip_txfm_sb = 0;
+  const int num_planes = av1_num_planes(cm);
+  const int ref_frame_1 = mbmi->ref_frame[1];
+  const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+  const int64_t rd_thresh =
+      ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int64_t min_header_rate =
+      mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+  // Account for minimum skip and non_skip rd.
+  // Eventually either one of them will be added to mode_rate
+  const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+
+  if (min_header_rd_possible > ref_best_rd) {
+    av1_invalid_rd_stats(rd_stats_y);
+    av1_invalid_rd_stats(rd_stats);
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats);
+  av1_init_rd_stats(rd_stats_y);
+  av1_init_rd_stats(rd_stats_uv);
+  rd_stats->rate = mode_rate;
+
+  if (!cpi->common.all_lossless)
+    check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+  if (!skip_txfm_sb) {
+    int64_t non_skip_rdcosty = INT64_MAX;
+    int64_t skip_rdcosty = INT64_MAX;
+    int64_t min_rdcosty = INT64_MAX;
+    int is_cost_valid_uv = 0;
+
+    // cost and distortion
+    av1_subtract_plane(x, bsize, 0);
+    if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+      // Motion mode
+      select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+      PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 2
+    } else {
+      super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+      for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+        set_blk_skip(x, 0, i, rd_stats_y->skip);
+    }
+
+    if (rd_stats_y->rate == INT_MAX) {
+      av1_invalid_rd_stats(rd_stats);
+      // TODO(angiebird): check if we need this
+      // restore_dst_buf(xd, *orig_dst, num_planes);
+      mbmi->ref_frame[1] = ref_frame_1;
+      return 0;
+    }
+
+    av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+    non_skip_rdcosty = RDCOST(
+        x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);
+    skip_rdcosty =
+        RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);
+    min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+
+    if (min_rdcosty > ref_best_rd) {
+      int64_t tokenonly_rdy =
+          AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+                 RDCOST(x->rdmult, 0, rd_stats_y->sse));
+      // Invalidate rd_stats_y to skip the rest of the motion modes search
+      if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
+          rd_thresh)
+        av1_invalid_rd_stats(rd_stats_y);
+      mbmi->ref_frame[1] = ref_frame_1;
+      return 0;
+    }
+
+    if (num_planes > 1) {
+      /* clang-format off */
+      is_cost_valid_uv =
+          inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
+                           ref_best_rd - non_skip_rdcosty,
+                           ref_best_rd - skip_rdcosty, FTXS_NONE);
+      if (!is_cost_valid_uv) {
+        mbmi->ref_frame[1] = ref_frame_1;
+        return 0;
+      }
+      /* clang-format on */
+      av1_merge_rd_stats(rd_stats, rd_stats_uv);
+    } else {
+      av1_init_rd_stats(rd_stats_uv);
+    }
+    if (rd_stats->skip) {
+      rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+      rd_stats_y->rate = 0;
+      rd_stats_uv->rate = 0;
+      rd_stats->rate += x->skip_cost[skip_ctx][1];
+      mbmi->skip = 0;
+      // here mbmi->skip temporarily plays a role as what this_skip2 does
+
+      int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      if (tmprd > ref_best_rd) {
+        mbmi->ref_frame[1] = ref_frame_1;
+        return 0;
+      }
+    } else if (!xd->lossless[mbmi->segment_id] &&
+               (RDCOST(x->rdmult,
+                       rd_stats_y->rate + rd_stats_uv->rate +
+                           x->skip_cost[skip_ctx][0],
+                       rd_stats->dist) >=
+                RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {
+      rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+      rd_stats->rate += x->skip_cost[skip_ctx][1];
+      rd_stats->dist = rd_stats->sse;
+      rd_stats_y->rate = 0;
+      rd_stats_uv->rate = 0;
+      mbmi->skip = 1;
+    } else {
+      rd_stats->rate += x->skip_cost[skip_ctx][0];
+      mbmi->skip = 0;
+    }
+  } else {
+    x->skip = 1;
+    mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+    // The cost of skip bit needs to be added.
+    mbmi->skip = 0;
+    rd_stats->rate += x->skip_cost[skip_ctx][1];
+
+    rd_stats->dist = 0;
+    rd_stats->sse = 0;
+    rd_stats_y->rate = 0;
+    rd_stats_uv->rate = 0;
+    rd_stats->skip = 1;
+    int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+    if (tmprd > ref_best_rd) {
+      mbmi->ref_frame[1] = ref_frame_1;
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static int handle_inter_intra_mode(const AV1_COMP *const cpi,
+                                   MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                   int mi_row, int mi_col, MB_MODE_INFO *mbmi,
+                                   HandleInterModeArgs *args,
+                                   int64_t ref_best_rd, int *rate_mv,
+                                   int *tmp_rate2, BUFFER_SET *orig_dst) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+  int64_t rd, best_interintra_rd = INT64_MAX;
+  int rmode, rate_sum;
+  int64_t dist_sum;
+  int tmp_rate_mv = 0;
+  int tmp_skip_txfm_sb;
+  int bw = block_size_wide[bsize];
+  int64_t tmp_skip_sse_sb;
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+  uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+  const int *const interintra_mode_cost =
+      x->interintra_mode_cost[size_group_lookup[bsize]];
+  const int_mv mv0 = mbmi->mv[0];
+  const int is_wedge_used = is_interintra_wedge_used(bsize);
+  int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0;
+  mbmi->ref_frame[1] = NONE_FRAME;  // build the inter-only predictor first
+  xd->plane[0].dst.buf = tmp_buf;
+  xd->plane[0].dst.stride = bw;
+  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+
+  restore_dst_buf(xd, *orig_dst, num_planes);
+  mbmi->ref_frame[1] = INTRA_FRAME;
+  mbmi->use_wedge_interintra = 0;
+  best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
+  int j = 0;  // remains 0 if the mode search below is skipped
+  if (cpi->sf.reuse_inter_intra_mode == 0 ||
+      best_interintra_mode == INTERINTRA_MODES) {
+    for (j = 0; j < INTERINTRA_MODES; ++j) {
+      mbmi->interintra_mode = (INTERINTRA_MODE)j;
+      rmode = interintra_mode_cost[mbmi->interintra_mode];
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+      model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+          cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+          &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+      rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+      if (rd < best_interintra_rd) {
+        best_interintra_rd = rd;
+        best_interintra_mode = mbmi->interintra_mode;
+      }
+    }
+    args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+  }
+  if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
+    mbmi->interintra_mode = best_interintra_mode;
+    rmode = interintra_mode_cost[mbmi->interintra_mode];
+    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                              intrapred, bw);
+    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  }
+  rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+  if (rd != INT64_MAX)
+    rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
+  best_interintra_rd = rd;
+  if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
+    return -1;  // half of the estimated rd already exceeds ref_best_rd
+  }
+  if (is_wedge_used) {
+    int64_t best_interintra_rd_nowedge = rd;
+    int64_t best_interintra_rd_wedge = INT64_MAX;
+    int_mv tmp_mv;
+    // Disable wedge search if source variance is small
+    if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+      mbmi->use_wedge_interintra = 1;
+
+      rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+               x->wedge_interintra_cost[bsize][1];
+
+      best_interintra_rd_wedge =
+          pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+      best_interintra_rd_wedge +=
+          RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
+      rd = INT64_MAX;
+      // Refine motion vector.
+      if (have_newmv_in_inter_mode(mbmi->mode)) {
+        // get negative of mask
+        const uint8_t *mask = av1_get_contiguous_soft_mask(
+            mbmi->interintra_wedge_index, 1, bsize);
+        tmp_mv = mbmi->mv[0];
+        compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
+                                      mi_col, intrapred, mask, bw, &tmp_rate_mv,
+                                      0);
+        if (mbmi->mv[0].as_int != tmp_mv.as_int) {
+          mbmi->mv[0].as_int = tmp_mv.as_int;
+          av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
+                                         bsize);
+          model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+              cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+          rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
+                      dist_sum);
+        }
+      }
+      if (rd >= best_interintra_rd_wedge) {
+        tmp_mv.as_int = mv0.as_int;
+        tmp_rate_mv = *rate_mv;
+        av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+      }
+      // Evaluate closer to true rd
+      rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                               &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+      if (rd != INT64_MAX)
+        rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
+                    dist_sum);
+      best_interintra_rd_wedge = rd;
+      if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+        mbmi->use_wedge_interintra = 1;
+        mbmi->mv[0].as_int = tmp_mv.as_int;
+        *tmp_rate2 += tmp_rate_mv - *rate_mv;
+        *rate_mv = tmp_rate_mv;
+      } else {
+        mbmi->use_wedge_interintra = 0;
+        mbmi->mv[0].as_int = mv0.as_int;
+        av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+      }
+    } else {
+      mbmi->use_wedge_interintra = 0;
+    }
+  }  // if (is_wedge_used)
+  if (num_planes > 1) {
+    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+  }
+  return 0;
+}
+
 // TODO(afergs): Refactor the MBMI references in here - there's four
 // TODO(afergs): Refactor optional args - add them to a struct or remove
 static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -7933,11 +8645,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
                               RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
                               int *disable_skip, int mi_row, int mi_col,
                               HandleInterModeArgs *const args,
-                              int64_t ref_best_rd, const int *refs, int rate_mv,
-                              BUFFER_SET *orig_dst
+                              int64_t ref_best_rd, const int *refs,
+                              int *rate_mv, BUFFER_SET *orig_dst
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
                               ,
-                              int64_t *best_est_rd
+                              TileDataEnc *tile_data, int64_t *best_est_rd,
+                              int do_tx_search, InterModesInfo *inter_modes_info
 #endif
 ) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -7946,41 +8659,49 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
-  int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
+  const int rate2_nocoeff = rd_stats->rate;
+  int best_xskip, best_disable_skip = 0;
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   MB_MODE_INFO base_mbmi, best_mbmi;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int rate_mv0 = *rate_mv;
+
   int interintra_allowed = cm->seq_params.enable_interintra_compound &&
                            is_interintra_allowed(mbmi) && mbmi->compound_idx;
   int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
-  int total_samples;
-
-  (void)rate_mv;
 
+  assert(mbmi->ref_frame[1] != INTRA_FRAME);
+  const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
   av1_invalid_rd_stats(&best_rd_stats);
-
   aom_clear_system_state();
-  mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
-  total_samples = mbmi->num_proj_ref[0];
-  rate2_nocoeff = rd_stats->rate;
+  mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+  if (cm->switchable_motion_mode) {
+    last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                                   cm->allow_warped_motion);
+  }
+  if (last_motion_mode_allowed == WARPED_CAUSAL) {
+    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
+  }
+  int total_samples = mbmi->num_proj_ref;
+  if (total_samples == 0) {
+    last_motion_mode_allowed = OBMC_CAUSAL;
+  }
   base_mbmi = *mbmi;
-  MOTION_MODE last_motion_mode_allowed =
-      cm->switchable_motion_mode
-          ? motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                cm->allow_warped_motion)
-          : SIMPLE_TRANSLATION;
-  assert(mbmi->ref_frame[1] != INTRA_FRAME);
-  const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
 
+  const int switchable_rate =
+      av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
   int64_t best_rd = INT64_MAX;
-
+  int best_rate_mv = rate_mv0;
   for (int mode_index = (int)SIMPLE_TRANSLATION;
        mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
        mode_index++) {
+    if (args->skip_motion_mode && mode_index) continue;
     int64_t tmp_rd = INT64_MAX;
     int tmp_rate2 = rate2_nocoeff;
     int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
     int skip_txfm_sb = 0;
+    int tmp_rate_mv = rate_mv0;
 
     *mbmi = base_mbmi;
     if (is_interintra_mode) {
@@ -7995,10 +8716,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
       // The prediction is calculated before motion_mode_rd() is called in
       // handle_inter_mode()
     } else if (mbmi->motion_mode == OBMC_CAUSAL) {
-      mbmi->motion_mode = OBMC_CAUSAL;
-      if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
-        int tmp_rate_mv = 0;
-
+      uint32_t cur_mv = mbmi->mv[0].as_int;
+      assert(!is_comp_pred);
+      if (have_newmv_in_inter_mode(this_mode)) {
         single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
         mbmi->mv[0].as_int = x->best_mv.as_int;
 #if USE_DISCOUNT_NEWMV_TEST
@@ -8006,36 +8726,38 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
           tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
         }
 #endif
-        tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+        tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+      }
+      if (mbmi->mv[0].as_int != cur_mv) {
+        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
       }
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
       av1_build_obmc_inter_prediction(
           cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
           args->left_pred_buf, args->left_pred_stride);
     } else if (mbmi->motion_mode == WARPED_CAUSAL) {
       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
       mbmi->motion_mode = WARPED_CAUSAL;
-      mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+      mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
       mbmi->interp_filters = av1_broadcast_interp_filter(
           av1_unswitchable_filter(cm->interp_filter));
 
       memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
       memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
       // Select the samples according to motion vector difference
-      if (mbmi->num_proj_ref[0] > 1) {
-        mbmi->num_proj_ref[0] = selectSamples(
-            &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize);
+      if (mbmi->num_proj_ref > 1) {
+        mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                           mbmi->num_proj_ref, bsize);
       }
 
-      if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+      if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-                           &mbmi->wm_params[0], mi_row, mi_col)) {
+                           &mbmi->wm_params, mi_row, mi_col)) {
         // Refine MV for NEWMV mode
-        if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
-          int tmp_rate_mv = 0;
+        assert(!is_comp_pred);
+        if (have_newmv_in_inter_mode(this_mode)) {
           const int_mv mv0 = mbmi->mv[0];
-          const WarpedMotionParams wm_params0 = mbmi->wm_params[0];
-          int num_proj_ref0 = mbmi->num_proj_ref[0];
+          const WarpedMotionParams wm_params0 = mbmi->wm_params;
+          int num_proj_ref0 = mbmi->num_proj_ref;
 
           // Refine MV in a small range.
           av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
@@ -8057,12 +8779,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
               tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
             }
 #endif
-            tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+            tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
           } else {
             // Restore the old MV and WM parameters.
             mbmi->mv[0] = mv0;
-            mbmi->wm_params[0] = wm_params0;
-            mbmi->num_proj_ref[0] = num_proj_ref0;
+            mbmi->wm_params = wm_params0;
+            mbmi->num_proj_ref = num_proj_ref0;
           }
         }
 
@@ -8071,144 +8793,10 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
         continue;
       }
     } else if (is_interintra_mode) {
-      INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
-      int64_t rd, best_interintra_rd = INT64_MAX;
-      int rmode, rate_sum;
-      int64_t dist_sum;
-      int j;
-      int tmp_rate_mv = 0;
-      int tmp_skip_txfm_sb;
-      int bw = block_size_wide[bsize];
-      int64_t tmp_skip_sse_sb;
-      DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
-      DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
-      uint8_t *tmp_buf, *intrapred;
-      const int *const interintra_mode_cost =
-          x->interintra_mode_cost[size_group_lookup[bsize]];
-
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
-        intrapred = CONVERT_TO_BYTEPTR(intrapred_);
-      } else {
-        tmp_buf = tmp_buf_;
-        intrapred = intrapred_;
-      }
-      const int_mv mv0 = mbmi->mv[0];
-
-      mbmi->ref_frame[1] = NONE_FRAME;
-      xd->plane[0].dst.buf = tmp_buf;
-      xd->plane[0].dst.stride = bw;
-      av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
-
-      restore_dst_buf(xd, *orig_dst, num_planes);
-      mbmi->ref_frame[1] = INTRA_FRAME;
-      mbmi->use_wedge_interintra = 0;
-      for (j = 0; j < INTERINTRA_MODES; ++j) {
-        mbmi->interintra_mode = (INTERINTRA_MODE)j;
-        rmode = interintra_mode_cost[mbmi->interintra_mode];
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-        av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-        av1_subtract_plane(x, bsize, 0);
-        model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                        &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
-        rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
-        if (rd < best_interintra_rd) {
-          best_interintra_rd = rd;
-          best_interintra_mode = mbmi->interintra_mode;
-        }
-      }
-      mbmi->interintra_mode = best_interintra_mode;
-      rmode = interintra_mode_cost[mbmi->interintra_mode];
-      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                intrapred, bw);
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-      rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                               &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-      if (rd != INT64_MAX)
-        rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum);
-      best_interintra_rd = rd;
-
-      if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
-        // restore ref_frame[1]
-        mbmi->ref_frame[1] = ref_frame_1;
-        continue;
-      }
-
-      if (is_interintra_wedge_used(bsize)) {
-        int64_t best_interintra_rd_nowedge = INT64_MAX;
-        int64_t best_interintra_rd_wedge = INT64_MAX;
-        int_mv tmp_mv;
-        InterpFilters backup_interp_filters = mbmi->interp_filters;
-        int rwedge = x->wedge_interintra_cost[bsize][0];
-        if (rd != INT64_MAX)
-          rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum);
-        best_interintra_rd_nowedge = rd;
-
-        // Disable wedge search if source variance is small
-        if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
-          mbmi->use_wedge_interintra = 1;
-
-          rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
-                   x->wedge_interintra_cost[bsize][1];
-
-          best_interintra_rd_wedge =
-              pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-
-          best_interintra_rd_wedge +=
-              RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0);
-          // Refine motion vector.
-          if (have_newmv_in_inter_mode(mbmi->mode)) {
-            // get negative of mask
-            const uint8_t *mask = av1_get_contiguous_soft_mask(
-                mbmi->interintra_wedge_index, 1, bsize);
-            tmp_mv = av1_get_ref_mv(x, 0);
-            compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
-                                          mi_col, intrapred, mask, bw,
-                                          &tmp_rate_mv, 0);
-            mbmi->mv[0].as_int = tmp_mv.as_int;
-            av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
-                                           bsize);
-            av1_subtract_plane(x, bsize, 0);
-            model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                            &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL,
-                            NULL);
-            rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
-                        dist_sum);
-            if (rd >= best_interintra_rd_wedge) {
-              tmp_mv.as_int = mv0.as_int;
-              tmp_rate_mv = rate_mv;
-              mbmi->interp_filters = backup_interp_filters;
-              av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-            }
-          } else {
-            tmp_mv.as_int = mv0.as_int;
-            tmp_rate_mv = rate_mv;
-            av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-          }
-          // Evaluate closer to true rd
-          rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                                   &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
-                                   INT64_MAX);
-          if (rd != INT64_MAX)
-            rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
-                        dist_sum);
-          best_interintra_rd_wedge = rd;
-          if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
-            mbmi->use_wedge_interintra = 1;
-            mbmi->mv[0].as_int = tmp_mv.as_int;
-            tmp_rate2 += tmp_rate_mv - rate_mv;
-          } else {
-            mbmi->use_wedge_interintra = 0;
-            mbmi->mv[0].as_int = mv0.as_int;
-            mbmi->interp_filters = backup_interp_filters;
-          }
-        } else {
-          mbmi->use_wedge_interintra = 0;
-        }
-      }  // if (is_interintra_wedge_used(bsize))
-      restore_dst_buf(xd, *orig_dst, num_planes);
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+      const int ret = handle_inter_intra_mode(
+          cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
+          &tmp_rate2, orig_dst);
+      if (ret < 0) continue;
     }
 
     if (!cpi->common.all_lossless)
@@ -8220,8 +8808,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
     rd_stats->sse = 0;
     rd_stats->skip = 1;
     rd_stats->rate = tmp_rate2;
-    if (av1_is_interp_needed(xd))
-      rd_stats->rate += av1_get_switchable_rate(cm, x, xd);
+    if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
     if (interintra_allowed) {
       rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
                                           [mbmi->ref_frame[1] == INTRA_FRAME];
@@ -8246,167 +8833,86 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
         rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
       }
     }
+
     if (!skip_txfm_sb) {
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
       int64_t est_rd = 0;
       int est_skip = 0;
-      if (cpi->sf.inter_mode_rd_model_estimation) {
-        InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type];
+      if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+          cm->tile_rows == 1) {
+        InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
         if (md->ready) {
           const int64_t curr_sse = get_sse(cpi, x);
-          est_rd =
-              get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate);
+          est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
+                              rd_stats->rate);
           est_skip = est_rd * 0.8 > *best_est_rd;
-#if INTER_MODE_RD_TEST
-          if (est_rd < *best_est_rd) {
-            *best_est_rd = est_rd;
-          }
-#else   // INTER_MODE_RD_TEST
           if (est_skip) {
-            ++md->skip_count;
             mbmi->ref_frame[1] = ref_frame_1;
             continue;
           } else {
             if (est_rd < *best_est_rd) {
               *best_est_rd = est_rd;
             }
-            ++md->non_skip_count;
           }
-#endif  // INTER_MODE_RD_TEST
         }
       }
 #endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
+    }
 
-      int64_t rdcosty = INT64_MAX;
-      int is_cost_valid_uv = 0;
-
-      // cost and distortion
-      av1_subtract_plane(x, bsize, 0);
-      if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-        // Motion mode
-        select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col,
-                           ref_best_rd);
-#if CONFIG_COLLECT_RD_STATS == 2
-        PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
-#endif  // CONFIG_COLLECT_RD_STATS == 2
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+    if (!do_tx_search) {
+      const int64_t curr_sse = get_sse(cpi, x);
+      int est_residue_cost = 0;
+      int64_t est_dist = 0;
+      const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+                                               &est_residue_cost, &est_dist);
+      (void)has_est_rd;
+      assert(has_est_rd);
+      const int mode_rate = rd_stats->rate;
+      rd_stats->rate += est_residue_cost;
+      rd_stats->dist = est_dist;
+      rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      if (cm->reference_mode == SINGLE_REFERENCE) {
+        if (!is_comp_pred) {
+          inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+                                rd_stats->rdcost, mbmi);
+        }
       } else {
-        super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
-        memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-        memset(x->blk_skip, rd_stats_y->skip,
-               sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+        inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+                              rd_stats->rdcost, mbmi);
       }
-
-      if (rd_stats_y->rate == INT_MAX) {
-        av1_invalid_rd_stats(rd_stats);
-        if (mbmi->motion_mode != SIMPLE_TRANSLATION ||
-            mbmi->ref_frame[1] == INTRA_FRAME) {
-          mbmi->ref_frame[1] = ref_frame_1;
-          continue;
-        } else {
-          restore_dst_buf(xd, *orig_dst, num_planes);
-          mbmi->ref_frame[1] = ref_frame_1;
+    } else {
+#endif
+      int mode_rate = rd_stats->rate;
+      if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
+                       rd_stats_uv, mode_rate, ref_best_rd)) {
+        if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
           return INT64_MAX;
         }
+        continue;
       }
-
-      av1_merge_rd_stats(rd_stats, rd_stats_y);
-
-      rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse));
-      if (num_planes > 1) {
-        /* clang-format off */
-        is_cost_valid_uv =
-            inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty,
-                             FTXS_NONE);
-        if (!is_cost_valid_uv) {
-          mbmi->ref_frame[1] = ref_frame_1;
-          continue;
+      if (!skip_txfm_sb) {
+        const int64_t curr_rd =
+            RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+        if (curr_rd < ref_best_rd) {
+          ref_best_rd = curr_rd;
         }
-        /* clang-format on */
-        av1_merge_rd_stats(rd_stats, rd_stats_uv);
-      } else {
-        av1_init_rd_stats(rd_stats_uv);
-      }
-#if CONFIG_RD_DEBUG
-      // record transform block coefficient cost
-      // TODO(angiebird): So far rd_debug tool only detects discrepancy of
-      // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi
-      // here because we already collect the coefficient cost. Move this part to
-      // other place when we need to compare non-coefficient cost.
-      mbmi->rd_stats = *rd_stats;
-#endif  // CONFIG_RD_DEBUG
-      const int skip_ctx = av1_get_skip_context(xd);
-      if (rd_stats->skip) {
-        rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-        rd_stats_y->rate = 0;
-        rd_stats_uv->rate = 0;
-        rd_stats->rate += x->skip_cost[skip_ctx][1];
-        mbmi->skip = 0;
-        // here mbmi->skip temporarily plays a role as what this_skip2 does
-      } else if (!xd->lossless[mbmi->segment_id] &&
-                 (RDCOST(x->rdmult,
-                         rd_stats_y->rate + rd_stats_uv->rate +
-                             x->skip_cost[skip_ctx][0],
-                         rd_stats->dist) >= RDCOST(x->rdmult,
-                                                   x->skip_cost[skip_ctx][1],
-                                                   rd_stats->sse))) {
-        rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-        rd_stats->rate += x->skip_cost[skip_ctx][1];
-        rd_stats->dist = rd_stats->sse;
-        rd_stats_y->rate = 0;
-        rd_stats_uv->rate = 0;
-        mbmi->skip = 1;
-      } else {
-        rd_stats->rate += x->skip_cost[skip_ctx][0];
-        mbmi->skip = 0;
-      }
-      *disable_skip = 0;
+        *disable_skip = 0;
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-      if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
-          cm->tile_rows == 1) {
-#if INTER_MODE_RD_TEST
-        if (md->ready) {
-          int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-          if (est_skip) {
-            ++md->skip_count;
-            if (real_rd < ref_best_rd) {
-              ++md->fp_skip_count;
-            }
-            // int fp_skip = real_rd < ref_best_rd;
-            // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd
-            // %ld ref_best_rd %ld\n",
-            //        est_skip, fp_skip, est_rd, *best_est_rd, real_rd,
-            //        ref_best_rd);
-          } else {
-            ++md->non_skip_count;
-          }
+        if (cpi->sf.inter_mode_rd_model_estimation) {
+          const int skip_ctx = av1_get_skip_context(xd);
+          inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+                               rd_stats->dist,
+                               rd_stats_y->rate + rd_stats_uv->rate +
+                                   x->skip_cost[skip_ctx][mbmi->skip]);
         }
-#endif  // INTER_MODE_RD_TEST
-        inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist,
-                             rd_stats_y->rate + rd_stats_uv->rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip],
-                             rd_stats->rate, ref_best_rd);
-      }
 #endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
-      int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      if (curr_rd < ref_best_rd) {
-        ref_best_rd = curr_rd;
+      } else {
+        *disable_skip = 1;
       }
-    } else {
-      x->skip = 1;
-      *disable_skip = 1;
-      mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
-
-      // The cost of skip bit needs to be added.
-      mbmi->skip = 0;
-      rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
-
-      rd_stats->dist = 0;
-      rd_stats->sse = 0;
-      rd_stats_y->rate = 0;
-      rd_stats_uv->rate = 0;
-      rd_stats->skip = 1;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
     }
+#endif
 
     if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
       if (is_nontrans_global_motion(xd, xd->mi[0])) {
@@ -8416,23 +8922,24 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
     }
 
     tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-    if ((mbmi->motion_mode == SIMPLE_TRANSLATION &&
-         mbmi->ref_frame[1] != INTRA_FRAME) ||
-        (tmp_rd < best_rd)) {
+    if (mode_index == 0)
+      args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+    if ((mode_index == 0) || (tmp_rd < best_rd)) {
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rd_stats = *rd_stats;
       best_rd_stats_y = *rd_stats_y;
+      best_rate_mv = tmp_rate_mv;
       if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
       memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+             sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
       best_xskip = x->skip;
       best_disable_skip = *disable_skip;
       if (best_xskip) break;
     }
   }
   mbmi->ref_frame[1] = ref_frame_1;
-
+  *rate_mv = best_rate_mv;
   if (best_rd == INT64_MAX) {
     av1_invalid_rd_stats(rd_stats);
     restore_dst_buf(xd, *orig_dst, num_planes);
@@ -8443,7 +8950,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
   *rd_stats_y = best_rd_stats_y;
   if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
   memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+         sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
   x->skip = best_xskip;
   *disable_skip = best_disable_skip;
 
@@ -8482,15 +8989,9 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
   return 0;
 }
 
-#ifndef NDEBUG
-static INLINE int is_single_inter_mode(int this_mode) {
-  return this_mode >= SINGLE_INTER_MODE_START &&
-         this_mode < SINGLE_INTER_MODE_END;
-}
-#endif
-
-static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
-  assert(is_single_inter_mode(single_mode));
+static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode,
+                                    uint8_t ref_mv_idx) {
+  assert(is_inter_singleref_mode(single_mode));
   int ref_mv_offset;
   if (single_mode == NEARESTMV) {
     ref_mv_offset = 0;
@@ -8502,14 +9003,15 @@ static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
   return ref_mv_offset;
 }
 
-static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
-                               int ref_mv_idx,
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+                               int ref_idx, int ref_mv_idx,
                                const MV_REFERENCE_FRAME *ref_frame,
                                const MB_MODE_INFO_EXT *mbmi_ext) {
   const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
   const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
-  const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
-  assert(is_single_inter_mode(single_mode));
+  const PREDICTION_MODE single_mode =
+      get_single_mode(this_mode, ref_idx, is_comp_pred);
+  assert(is_inter_singleref_mode(single_mode));
   if (single_mode == NEWMV) {
     this_mv->as_int = INVALID_MV;
   } else if (single_mode == GLOBALMV) {
@@ -8533,7 +9035,7 @@ static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
 }
 
 // This function update the non-new mv for the current prediction mode
-static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
                                const AV1_COMMON *cm, const MACROBLOCK *x) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_MODE_INFO *mbmi = xd->mi[0];
@@ -8543,7 +9045,8 @@ static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
     int_mv this_mv;
     get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame,
                 x->mbmi_ext);
-    const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
+    const PREDICTION_MODE single_mode =
+        get_single_mode(this_mode, i, is_comp_pred);
     if (single_mode == NEWMV) {
       cur_mv[i] = this_mv;
     } else {
@@ -8584,18 +9087,29 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
   return cost;
 }
 
-static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                   BLOCK_SIZE bsize, int mi_col, int mi_row,
-                                   int_mv *cur_mv, int masked_compound_used,
-                                   BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst,
-                                   int *rate_mv, int64_t *rd,
-                                   RD_STATS *rd_stats, int64_t ref_best_rd) {
+// Bundle of buffer pointers that compound_type_rd() receives through its
+// `buffers` argument. For sizes and alignment of these arrays, refer to
+// alloc_compound_type_rd_buffers() function.
+typedef struct {
+  uint8_t *pred0;              // wrapped as preds0[0] by compound_type_rd()
+  uint8_t *pred1;              // wrapped as preds1[0] by compound_type_rd()
+  int16_t *residual1;          // src - pred1
+  int16_t *diff10;             // pred1 - pred0
+  uint8_t *tmp_best_mask_buf;  // backup of the best segmentation mask
+} CompoundTypeRdBuffers;
+
+static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, int mi_col, int mi_row,
+                            int_mv *cur_mv, int masked_compound_used,
+                            BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+                            CompoundTypeRdBuffers *buffers, int *rate_mv,
+                            int64_t *rd, RD_STATS *rd_stats,
+                            int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const int this_mode = mbmi->mode;
+  const PREDICTION_MODE this_mode = mbmi->mode;
   const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
   int rate_sum, rs2;
   int64_t dist_sum;
 
@@ -8605,45 +9119,19 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   int64_t tmp_skip_sse_sb;
   INTERINTER_COMPOUND_DATA best_compound_data;
   best_compound_data.type = COMPOUND_AVERAGE;
-  DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
-  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
-  uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
-  uint8_t *preds0[1] = { pred0 };
-  uint8_t *preds1[1] = { pred1 };
+  uint8_t *preds0[1] = { buffers->pred0 };
+  uint8_t *preds1[1] = { buffers->pred1 };
   int strides[1] = { bw };
   int tmp_rate_mv;
   const int num_pix = 1 << num_pels_log2_lookup[bsize];
   const int mask_len = 2 * num_pix * sizeof(uint8_t);
   COMPOUND_TYPE cur_type;
   int best_compmode_interinter_cost = 0;
-  int can_use_previous = cm->allow_warped_motion;
+  int calc_pred_masked_compound = 1;
 
   best_mv[0].as_int = cur_mv[0].as_int;
   best_mv[1].as_int = cur_mv[1].as_int;
   *rd = INT64_MAX;
-  if (masked_compound_used) {
-    // get inter predictors to use for masked compound modes
-    av1_build_inter_predictors_for_planes_single_buf(
-        xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
-    av1_build_inter_predictors_for_planes_single_buf(
-        xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
-    const struct buf_2d *const src = &x->plane[0].src;
-    if (get_bitdepth_data_path_index(xd)) {
-      aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                                CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
-      aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1),
-                                bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
-    } else {
-      aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1,
-                         bw);
-      aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw);
-    }
-  }
-  const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0];
-  const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst;
-  const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst;
   for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
     if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
     if (!is_interinter_compound_used(cur_type, bsize)) continue;
@@ -8662,17 +9150,17 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
       }
       masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
       rs2 = masked_type_cost;
-      // No need to call av1_build_inter_predictors_sby here
-      // 1. COMPOUND_AVERAGE is always the first candidate
-      // 2. av1_build_inter_predictors_sby has been called by
-      // interpolation_filter_search
-      int64_t est_rd =
-          estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+      if (mode_rd < ref_best_rd) {
+        av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+        int64_t est_rd =
+            estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+        if (est_rd != INT64_MAX)
+          best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+      }
       // use spare buffer for following compound type try
-      restore_dst_buf(xd, *backup_buf, 1);
-      if (est_rd != INT64_MAX)
-        best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+      restore_dst_buf(xd, *tmp_dst, 1);
     } else {
       mbmi->comp_group_idx = 1;
       masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
@@ -8682,19 +9170,20 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
           *rd / 3 < ref_best_rd) {
         best_rd_cur = build_and_cost_compound_type(
             cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
-            &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row,
-            mi_col);
+            &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+            strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
+            &calc_pred_masked_compound);
       }
     }
     if (best_rd_cur < *rd) {
       *rd = best_rd_cur;
       best_compound_data = mbmi->interinter_comp;
       if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
-        memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len);
+        memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
       }
       best_compmode_interinter_cost = rs2;
       if (have_newmv_in_inter_mode(this_mode)) {
-        if (use_masked_motion_search(cur_type)) {
+        if (cur_type == COMPOUND_WEDGE) {
           best_tmp_rate_mv = tmp_rate_mv;
           best_mv[0].as_int = mbmi->mv[0].as_int;
           best_mv[1].as_int = mbmi->mv[1].as_int;
@@ -8712,28 +9201,69 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
     mbmi->comp_group_idx =
         (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
     mbmi->interinter_comp = best_compound_data;
-    memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len);
+    memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
   }
   if (have_newmv_in_inter_mode(this_mode)) {
     mbmi->mv[0].as_int = best_mv[0].as_int;
     mbmi->mv[1].as_int = best_mv[1].as_int;
-    if (use_masked_motion_search(mbmi->interinter_comp.type)) {
+    if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
       rd_stats->rate += best_tmp_rate_mv - *rate_mv;
       *rate_mv = best_tmp_rate_mv;
     }
   }
-  restore_dst_buf(xd, *best_buf, 1);
+  restore_dst_buf(xd, *orig_dst, 1);
   return best_compmode_interinter_cost;
 }
 
+static INLINE int is_single_newmv_valid(HandleInterModeArgs *args,
+                                        MB_MODE_INFO *mbmi,
+                                        PREDICTION_MODE this_mode) {
+  for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {  // check both references
+    const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1);
+    const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
+    if (single_mode == NEWMV &&
+        args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
+      return 0;  // single_newmv_valid flag is 0 for a NEWMV component
+    }
+  }
+  return 1;  // no NEWMV component of this_mode was flagged invalid
+}
+
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+                               const MV_REFERENCE_FRAME *ref_frame,
+                               PREDICTION_MODE mode) {
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+  const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+  const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV);
+  const int has_drl =
+      (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
+  const int ref_set =
+      has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1;
+  // 1 when has_drl is 0; otherwise capped at MAX_REF_MV_SERCH.
+  return ref_set;
+}
+
+typedef struct {
+  int64_t rd;    // RD cost of the mode; INT64_MAX until a valid one is set
+  int drl_cost;  // rate from get_drl_cost() for this ref_mv_idx
+  int rate_mv;   // motion vector rate stored for this ref_mv_idx
+  int_mv mv;     // first MV of the mode; INVALID_MV until it is filled in
+} inter_mode_info;
+
 static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize, RD_STATS *rd_stats,
                                  RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
                                  int *disable_skip, int mi_row, int mi_col,
-                                 HandleInterModeArgs *args, int64_t ref_best_rd
+                                 HandleInterModeArgs *args, int64_t ref_best_rd,
+                                 uint8_t *const tmp_buf,
+                                 CompoundTypeRdBuffers *rd_buffers
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
                                  ,
-                                 int64_t *best_est_rd
+                                 TileDataEnc *tile_data, int64_t *best_est_rd,
+                                 const int do_tx_search,
+                                 InterModesInfo *inter_modes_info
 #endif
 ) {
   const AV1_COMMON *cm = &cpi->common;
@@ -8742,15 +9272,26 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   MB_MODE_INFO *mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const int is_comp_pred = has_second_ref(mbmi);
-  const int this_mode = mbmi->mode;
+  const PREDICTION_MODE this_mode = mbmi->mode;
   int i;
   int refs[2] = { mbmi->ref_frame[0],
                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int rate_mv = 0;
-  DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
   int64_t rd = INT64_MAX;
-  BUFFER_SET orig_dst, tmp_dst;
+
+  // do first prediction into the destination buffer. Do the next
+  // prediction into a temporary buffer. Then keep track of which one
+  // of these currently holds the best predictor, and use the other
+  // one for future predictions. In the end, copy from tmp_buf to
+  // dst if necessary.
+  struct macroblockd_plane *p = xd->plane;
+  BUFFER_SET orig_dst = {
+    { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+    { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+  };
+  const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+                                 tmp_buf + 2 * MAX_SB_SQUARE },
+                               { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
@@ -8765,36 +9306,29 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   MB_MODE_INFO best_mbmi = *mbmi;
   int best_disable_skip;
   int best_xskip;
-  int plane_rate[MAX_MB_PLANE] = { 0 };
-  int64_t plane_sse[MAX_MB_PLANE] = { 0 };
-  int64_t plane_dist[MAX_MB_PLANE] = { 0 };
   int64_t newmv_ret_val = INT64_MAX;
   int_mv backup_mv[2] = { { 0 } };
   int backup_rate_mv = 0;
+  inter_mode_info mode_info[MAX_REF_MV_SERCH];
 
   int comp_idx;
   const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
                               (mbmi->mode != GLOBAL_GLOBALMV);
 
-  const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) &&
-                       mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
-                      ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
-                       mbmi_ext->ref_mv_count[ref_frame_type] > 1);
-
   // TODO(jingning): This should be deprecated shortly.
-  const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
-  const int ref_set =
-      has_drl ? AOMMIN(MAX_REF_MV_SERCH,
-                       mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset)
-              : 1;
+  const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+  const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
 
   for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
+    mode_info[ref_mv_idx].rd = INT64_MAX;
+
     if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
       if (mbmi->ref_frame[0] == LAST2_FRAME ||
           mbmi->ref_frame[0] == LAST3_FRAME ||
           mbmi->ref_frame[1] == LAST2_FRAME ||
           mbmi->ref_frame[1] == LAST3_FRAME) {
-        if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset]
+        if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv]
                 .weight < REF_CAT_LEVEL) {
           continue;
         }
@@ -8811,41 +9345,40 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     mode_ctx =
         av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
 
-    mbmi->num_proj_ref[0] = 0;
-    mbmi->num_proj_ref[1] = 0;
+    mbmi->num_proj_ref = 0;
     mbmi->motion_mode = SIMPLE_TRANSLATION;
     mbmi->ref_mv_idx = ref_mv_idx;
 
-    if (is_comp_pred) {
-      for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
-        const int single_mode =
-            get_single_mode(this_mode, ref_idx, is_comp_pred);
-        if (single_mode == NEWMV &&
-            args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]]
-                    .as_int == INVALID_MV)
-          continue;
-      }
+    if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) {
+      continue;
     }
 
     rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
-    rd_stats->rate +=
+    const int drl_cost =
         get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+    rd_stats->rate += drl_cost;
+    mode_info[ref_mv_idx].drl_cost = drl_cost;
+
+    if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+        mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+      continue;
+    }
 
-    const RD_STATS backup_rd_stats = *rd_stats;
-    const MB_MODE_INFO backup_mbmi = *mbmi;
     int64_t best_rd2 = INT64_MAX;
 
+    const RD_STATS backup_rd_stats = *rd_stats;
     // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
     for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
       int rs = 0;
       int compmode_interinter_cost = 0;
-      *rd_stats = backup_rd_stats;
-      *mbmi = backup_mbmi;
       mbmi->compound_idx = comp_idx;
-
       if (is_comp_pred && comp_idx == 0) {
+        *rd_stats = backup_rd_stats;
+        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+        if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+        mbmi->num_proj_ref = 0;
+        mbmi->motion_mode = SIMPLE_TRANSLATION;
         mbmi->comp_group_idx = 0;
-        mbmi->compound_idx = 0;
 
         const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
         const int comp_index_ctx = get_comp_index_context(cm, xd);
@@ -8885,32 +9418,69 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         } else {
           rd_stats->rate += rate_mv;
         }
-      }
-      for (i = 0; i < is_comp_pred + 1; ++i) {
-        mbmi->mv[i].as_int = cur_mv[i].as_int;
-      }
 
-      // Initialise tmp_dst and orig_dst buffers to prevent "may be used
-      // uninitialized" warnings in GCC when the stream is monochrome.
-      memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
-      memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
-      memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
-      memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
-
-      // do first prediction into the destination buffer. Do the next
-      // prediction into a temporary buffer. Then keep track of which one
-      // of these currently holds the best predictor, and use the other
-      // one for future predictions. In the end, copy from tmp_buf to
-      // dst if necessary.
-      for (i = 0; i < num_planes; i++) {
-        tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
-        tmp_dst.stride[i] = MAX_SB_SIZE;
+        if (cpi->sf.skip_repeated_newmv) {
+          if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+            int skip = 0;
+            int this_rate_mv = 0;
+            for (i = 0; i < ref_mv_idx; ++i) {
+              // Check if the motion search result matches a previous result
+              if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) {
+                // If the compared mode has no valid rd, it is unlikely this
+                // mode will be the best mode
+                if (mode_info[i].rd == INT64_MAX) {
+                  skip = 1;
+                  break;
+                }
+                // Compare the cost difference including drl cost and mv cost
+                if (mode_info[i].mv.as_int != INVALID_MV) {
+                  const int compare_cost =
+                      mode_info[i].rate_mv + mode_info[i].drl_cost;
+                  const int_mv ref_mv = av1_get_ref_mv(x, 0);
+                  this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv,
+                                                 &ref_mv.as_mv, x->nmvjointcost,
+                                                 x->mvcost, MV_COST_WEIGHT);
+                  const int this_cost = this_rate_mv + drl_cost;
+
+                  if (compare_cost < this_cost) {
+                    skip = 1;
+                    break;
+                  } else {
+                    // If the compared mode is the current best, make this mode
+                    // the best instead and update the corresponding variables
+                    if (best_mbmi.ref_mv_idx == i) {
+                      assert(best_rd != INT64_MAX);
+                      best_mbmi.ref_mv_idx = ref_mv_idx;
+                      best_rd_stats.rate += this_cost - compare_cost;
+                      best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
+                                       best_rd_stats.dist);
+                      if (best_rd < ref_best_rd) ref_best_rd = best_rd;
+
+                      skip = 1;
+                      break;
+                    }
+                  }
+                }
+              }
+            }
+            if (skip) {
+              args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+                  args->modelled_rd[this_mode][i][refs[0]];
+              args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+                  args->simple_rd[this_mode][i][refs[0]];
+              mode_info[ref_mv_idx].rd = mode_info[i].rd;
+              mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+              mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+              restore_dst_buf(xd, orig_dst, num_planes);
+              continue;
+            }
+          }
+        }
       }
-      for (i = 0; i < num_planes; i++) {
-        orig_dst.plane[i] = xd->plane[i].dst.buf;
-        orig_dst.stride[i] = xd->plane[i].dst.stride;
+      for (i = 0; i < is_comp_pred + 1; ++i) {
+        mbmi->mv[i].as_int = cur_mv[i].as_int;
       }
-
       const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
 #if USE_DISCOUNT_NEWMV_TEST
       // We don't include the cost of the second reference here, because there
@@ -8937,47 +9507,62 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         continue;
       }
 
-      ret_val = interpolation_filter_search(
-          x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
-          args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
-      if (ret_val != 0) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        continue;
-      } else if (cpi->sf.model_based_post_interp_filter_breakout &&
-                 ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        if ((rd >> 4) > ref_best_rd) break;
-        continue;
-      }
-
+      int skip_build_pred = 0;
       if (is_comp_pred && comp_idx) {
+        // Find matching interp filter or set to default interp filter
+        const int need_search =
+            av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+        int match_found = -1;
+        const InterpFilter assign_filter = cm->interp_filter;
+        if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+          match_found = find_interp_filter_in_stats(x, mbmi);
+        }
+        if (!need_search || match_found == -1) {
+          set_default_interp_filters(mbmi, assign_filter);
+        }
+
         int64_t best_rd_compound;
         compmode_interinter_cost = compound_type_rd(
             cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
-            &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats,
-            ref_best_rd);
+            &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+            rd_stats, ref_best_rd);
         if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
           restore_dst_buf(xd, orig_dst, num_planes);
           continue;
         }
-        if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) {
-          int tmp_rate;
-          int64_t tmp_dist;
-          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst,
-                                        bsize);
-          for (int plane = 0; plane < num_planes; ++plane)
-            av1_subtract_plane(x, bsize, plane);
-          model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
-                          &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
-                          plane_sse, plane_dist);
-          rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
+        // No need to call av1_build_inter_predictors_sby if
+        // COMPOUND_AVERAGE is selected: it is the first candidate
+        // in compound_type_rd, and the compound types searched
+        // after it use the tmp_dst buffer.
+        if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
+          if (num_planes > 1)
+            av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
+                                            bsize);
+          skip_build_pred = 1;
         }
       }
 
+      ret_val = interpolation_filter_search(
+          x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+          args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
+          skip_build_pred, args, ref_best_rd);
+      if (args->modelled_rd != NULL && !is_comp_pred) {
+        args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+      }
+      if (ret_val != 0) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+                 ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        if ((rd >> 3) * 2 > ref_best_rd) break;
+        continue;
+      }
+
       if (search_jnt_comp) {
         // if 1/2 model rd is larger than best_rd in jnt_comp mode,
         // use jnt_comp mode, save additional search
-        if ((rd >> 1) > best_rd) {
+        if ((rd >> 3) * 4 > best_rd) {
           restore_dst_buf(xd, orig_dst, num_planes);
           continue;
         }
@@ -8991,31 +9576,31 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         if (is_comp_pred) {
           const int mode0 = compound_ref0_mode(this_mode);
           const int mode1 = compound_ref1_mode(this_mode);
-          const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
-                                     args->modelled_rd[mode1][refs[1]]);
-          if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+          const int64_t mrd =
+              AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                     args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+          if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
             restore_dst_buf(xd, orig_dst, num_planes);
             continue;
           }
-        } else {
-          args->modelled_rd[this_mode][refs[0]] = rd;
-        }
-      }
-
-      if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
-        // if current pred_error modeled rd is substantially more than the best
-        // so far, do not bother doing full rd
-        if (rd / 2 > ref_best_rd) {
-          restore_dst_buf(xd, orig_dst, num_planes);
-          continue;
         }
       }
-
       rd_stats->rate += compmode_interinter_cost;
 
       if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
         // TODO(chengchen): this speed feature introduces big loss.
         // Need better estimation of rate distortion.
+        int dummy_rate;
+        int64_t dummy_dist;
+        int plane_rate[MAX_MB_PLANE] = { 0 };
+        int64_t plane_sse[MAX_MB_PLANE] = { 0 };
+        int64_t plane_dist[MAX_MB_PLANE] = { 0 };
+
+        model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+            cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
+            &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
+            plane_dist);
+
         rd_stats->rate += rs;
         rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
         rd_stats_y->rate = plane_rate[0];
@@ -9028,18 +9613,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
       } else {
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-        ret_val =
-            motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
-                           disable_skip, mi_row, mi_col, args, ref_best_rd,
-                           refs, rate_mv, &orig_dst, best_est_rd);
+        ret_val = motion_mode_rd(
+            cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
+            mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
+            tile_data, best_est_rd, do_tx_search, inter_modes_info);
 #else
         ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
                                  rd_stats_uv, disable_skip, mi_row, mi_col,
-                                 args, ref_best_rd, refs, rate_mv, &orig_dst);
+                                 args, ref_best_rd, refs, &rate_mv, &orig_dst);
 #endif
       }
+      mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+      mode_info[ref_mv_idx].rate_mv = rate_mv;
       if (ret_val != INT64_MAX) {
         int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+        mode_info[ref_mv_idx].rd = tmp_rd;
         if (tmp_rd < best_rd) {
           best_rd_stats = *rd_stats;
           best_rd_stats_y = *rd_stats_y;
@@ -9049,7 +9637,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
           best_disable_skip = *disable_skip;
           best_xskip = x->skip;
           memcpy(best_blk_skip, x->blk_skip,
-                 sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+                 sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
         }
 
         if (tmp_rd < best_rd2) {
@@ -9062,8 +9650,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       }
       restore_dst_buf(xd, orig_dst, num_planes);
     }
-
-    args->modelled_rd = NULL;
   }
 
   if (best_rd == INT64_MAX) return INT64_MAX;
@@ -9078,7 +9664,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   assert(IMPLIES(mbmi->comp_group_idx == 1,
                  mbmi->interinter_comp.type != COMPOUND_AVERAGE));
   memcpy(x->blk_skip, best_blk_skip,
-         sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+         sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
 
   return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
 }
@@ -9186,8 +9772,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     int sadpb = x->sadperbit16;
     int cost_list[5];
     int bestsme = av1_full_pixel_search(
-        cpi, x, bsize, &mvp_full, step_param, sadpb,
-        cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+        cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+        sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
         (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
 
     x->mv_limits = tmp_mv_limits;
@@ -9229,8 +9815,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     } else {
       super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-      memset(x->blk_skip, rd_stats.skip,
-             sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+      for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+        set_blk_skip(x, 0, i, rd_stats.skip);
     }
     if (num_planes > 1) {
       super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
@@ -9254,7 +9840,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
       best_skip = x->skip;
       best_rdcost = rdc_noskip;
       memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+             sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
     }
 
     if (!xd->lossless[mbmi->segment_id]) {
@@ -9271,7 +9857,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
         best_skip = x->skip;
         best_rdcost = rdc_skip;
         memcpy(best_blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+               sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
       }
     }
   }
@@ -9279,7 +9865,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   *rd_cost = best_rdcost;
   x->skip = best_skip;
   memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+         sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
   return best_rd;
 }
 
@@ -9302,8 +9888,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
   mbmi->mv[0].as_int = 0;
 
   const int64_t intra_yrd =
-      rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
-                             &y_skip, bsize, best_rd, ctx);
+      rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
+                             &dist_y, &y_skip, bsize, best_rd, ctx);
 
   if (intra_yrd < best_rd) {
     // Only store reconstructed luma when there's chroma RDO. When there's no
@@ -9447,6 +10033,17 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->ref_frame[0] = ref_frame;
   mbmi->ref_frame[1] = second_ref_frame;
+  const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
+    if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+        x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+      return;
+    }
+    MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+    av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+                     mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                     mi_col, mbmi_ext->mode_context);
+  }
 
   assert(this_mode == NEAREST_NEARESTMV);
   if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
@@ -9508,7 +10105,7 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
     memset(search_state->best_mbmode.inter_tx_size,
            search_state->best_mbmode.tx_size,
            sizeof(search_state->best_mbmode.inter_tx_size));
-    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h,
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h,
                   search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
 
     // Set up color-related variables for skip mode.
@@ -9595,11 +10192,12 @@ static void sf_refine_fast_tx_type_search(
       } else {
         super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
         memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-        memset(x->blk_skip, rd_stats_y.skip,
-               sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+        for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+          set_blk_skip(x, 0, i, rd_stats_y.skip);
       }
       if (num_planes > 1) {
-        inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE);
+        inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
+                         FTXS_NONE);
       } else {
         av1_init_rd_stats(&rd_stats_uv);
       }
@@ -9647,7 +10245,7 @@ static void sf_refine_fast_tx_type_search(
 static void set_params_rd_pick_inter_mode(
     const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
     BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
-    uint32_t mode_skip_mask[REF_FRAMES],
+    uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
     unsigned int ref_costs_single[REF_FRAMES],
     unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
     struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
@@ -9700,18 +10298,45 @@ static void set_params_rd_pick_inter_mode(
     x->pred_mv_sad[ref_frame] = INT_MAX;
     x->mbmi_ext->mode_context[ref_frame] = 0;
     x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+    mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
     if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+      if (mbmi->partition != PARTITION_NONE &&
+          mbmi->partition != PARTITION_SPLIT) {
+        if (skip_ref_frame_mask & (1 << ref_frame)) {
+          int skip = 1;
+          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+            if (!(skip_ref_frame_mask & (1 << r))) {
+              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+              if (rf[0] == ref_frame || rf[1] == ref_frame) {
+                skip = 0;
+                break;
+              }
+            }
+          }
+          if (skip) continue;
+        }
+      }
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                                  yv12_mb);
     }
   }
-
-  // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector
-  // references for compound prediction, as not every pair of reference frames
-  // woud be examined for the RD evaluation.
+  // ref_frame = ALTREF_FRAME
   for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
     x->mbmi_ext->mode_context[ref_frame] = 0;
+    mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+    const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+    if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
+          (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+      continue;
+    }
+
+    if (mbmi->partition != PARTITION_NONE &&
+        mbmi->partition != PARTITION_SPLIT) {
+      if (skip_ref_frame_mask & (1 << ref_frame)) {
+        continue;
+      }
+    }
     av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
                      mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
                      mi_col, mbmi_ext->mode_context);
@@ -9838,9 +10463,10 @@ static void set_params_rd_pick_inter_mode(
   }
 }
 
-static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
-                                RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx,
-                                BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                                int mi_col, RD_STATS *rd_cost,
+                                PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+                                MB_MODE_INFO *const mbmi,
                                 PALETTE_MODE_INFO *const pmi,
                                 unsigned int *ref_costs_single,
                                 InterModeSearchState *search_state) {
@@ -9867,9 +10493,9 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
   rate_overhead_palette = rd_pick_palette_intra_sby(
-      cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
-      best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL,
-      NULL, NULL, NULL, ctx, best_blk_skip);
+      cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED],
+      &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+      &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip);
   if (pmi->palette_size[0] == 0) return;
 
   memcpy(x->blk_skip, best_blk_skip,
@@ -9986,15 +10612,49 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state,
   av1_zero(search_state->single_newmv);
   av1_zero(search_state->single_newmv_rate);
   av1_zero(search_state->single_newmv_valid);
-  for (int i = 0; i < MB_MODE_COUNT; ++i)
-    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
-      search_state->modelled_rd[i][ref_frame] = INT64_MAX;
+  for (int i = 0; i < MB_MODE_COUNT; ++i) {
+    for (int j = 0; j < MAX_REF_MV_SERCH; ++j) {
+      for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+        search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+        search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+      }
+    }
+  }
+
+  for (int dir = 0; dir < 2; ++dir) {
+    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+        SingleInterModeState *state;
+
+        state = &search_state->single_state[dir][mode][ref_frame];
+        state->ref_frame = NONE_FRAME;
+        state->rd = INT64_MAX;
+
+        state = &search_state->single_state_modelled[dir][mode][ref_frame];
+        state->ref_frame = NONE_FRAME;
+        state->rd = INT64_MAX;
+      }
+    }
+  }
+  for (int dir = 0; dir < 2; ++dir) {
+    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+        search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+      }
+    }
+  }
+  av1_zero(search_state->single_state_cnt);
+  av1_zero(search_state->single_state_modelled_cnt);
 }
 
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip compound only, but still try single motion modes
 static int inter_mode_search_order_independent_skip(
-    const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
-    int mi_row, int mi_col, uint32_t *mode_skip_mask,
-    uint16_t *ref_frame_skip_mask) {
+    const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
+    BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
+    uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
+    InterModeSearchState *search_state) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const AV1_COMMON *const cm = &cpi->common;
   const struct segmentation *const seg = &cm->seg;
@@ -10003,6 +10663,32 @@ static int inter_mode_search_order_independent_skip(
   const unsigned char segment_id = mbmi->segment_id;
   const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
   const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+  int skip_motion_mode = 0;
+  if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+    const int ref_type = av1_ref_frame_type(ref_frame);
+    int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+    if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Compound ref modes depend on the motion estimation results of their
+      // two single ref modes (the best single-ref mvs are the start points).
+      // If the current single ref mode is marked skip, we need to check
+      // whether it will be used by any compound ref mode.
+      for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+        if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+          const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+          if (rf[0] == ref_type || rf[1] == ref_type) {
+            // Found a non-skipped compound ref mode that contains the current
+            // single ref, so this single ref can't be skipped completely.
+            // Just skip its motion mode search, but still try its simple
+            // transition mode.
+            skip_motion_mode = 1;
+            skip_ref = 0;
+            break;
+          }
+        }
+      }
+    }
+    if (skip_ref) return 1;
+  }
 
   if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
       !x->cb_partition_scan) {
@@ -10115,9 +10801,12 @@ static int inter_mode_search_order_independent_skip(
     return 1;
   }
 
-  if (skip_repeated_mv(cm, x, this_mode, ref_frame)) {
+  if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
     return 1;
   }
+  if (skip_motion_mode) {
+    return 2;
+  }
   return 0;
 }
 
@@ -10139,12 +10828,13 @@ static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
   set_default_interp_filters(mbmi, cm->interp_filter);
 }
 
-static int handle_intra_mode(InterModeSearchState *search_state,
-                             const AV1_COMP *cpi, MACROBLOCK *x,
-                             BLOCK_SIZE bsize, int ref_frame_cost,
-                             const PICK_MODE_CONTEXT *ctx, int disable_skip,
-                             RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                             RD_STATS *rd_stats_uv) {
+static int64_t handle_intra_mode(InterModeSearchState *search_state,
+                                 const AV1_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int ref_frame_cost,
+                                 const PICK_MODE_CONTEXT *ctx, int disable_skip,
+                                 RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                                 RD_STATS *rd_stats_uv) {
   const AV1_COMMON *cm = &cpi->common;
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -10159,9 +10849,19 @@ static int handle_intra_mode(InterModeSearchState *search_state,
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
   const int num_planes = av1_num_planes(cm);
-  av1_init_rd_stats(rd_stats);
-  av1_init_rd_stats(rd_stats_y);
-  av1_init_rd_stats(rd_stats_uv);
+  const int skip_ctx = av1_get_skip_context(xd);
+
+  int known_rate = intra_mode_cost[mbmi->mode];
+  known_rate += ref_frame_cost;
+  if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+    known_rate += intra_cost_penalty;
+  known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+  const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+  if (known_rd > search_state->best_rd) {
+    search_state->skip_intra_modes = 1;
+    return INT64_MAX;
+  }
+
   TX_SIZE uv_tx;
   int is_directional_mode = av1_is_directional_mode(mbmi->mode);
   if (is_directional_mode && av1_use_angle_delta(bsize)) {
@@ -10178,20 +10878,33 @@ static int handle_intra_mode(InterModeSearchState *search_state,
                          search_state->directional_mode_skip_mask);
       search_state->angle_stats_ready = 1;
     }
-    if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0;
+    if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX;
+    av1_init_rd_stats(rd_stats_y);
     rd_stats_y->rate = INT_MAX;
-    rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize,
-                            intra_mode_cost[mbmi->mode], search_state->best_rd,
-                            &model_rd);
+    rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y,
+                            bsize, intra_mode_cost[mbmi->mode],
+                            search_state->best_rd, &model_rd);
   } else {
+    av1_init_rd_stats(rd_stats_y);
     mbmi->angle_delta[PLANE_TYPE_Y] = 0;
     super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
   }
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   memcpy(best_blk_skip, x->blk_skip,
          sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-
+  int try_filter_intra = 0;
+  int64_t best_rd_tmp = INT64_MAX;
   if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+    if (rd_stats_y->rate != INT_MAX) {
+      const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
+                           intra_mode_cost[mbmi->mode];
+      best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+      try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd);
+    } else {
+      try_filter_intra = !(search_state->best_mbmode.skip);
+    }
+  }
+  if (try_filter_intra) {
     RD_STATS rd_stats_y_fi;
     int filter_intra_selected_flag = 0;
     TX_SIZE best_tx_size = mbmi->tx_size;
@@ -10199,20 +10912,12 @@ static int handle_intra_mode(InterModeSearchState *search_state,
     memcpy(best_txk_type, mbmi->txk_type,
            sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
     FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
-    int64_t best_rd_tmp = INT64_MAX;
-    if (rd_stats_y->rate != INT_MAX) {
-      best_rd_tmp = RDCOST(x->rdmult,
-                           rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
-                               intra_mode_cost[mbmi->mode],
-                           rd_stats_y->dist);
-    }
 
     mbmi->filter_intra_mode_info.use_filter_intra = 1;
     for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
          fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
       int64_t this_rd_tmp;
       mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
-
       super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
       if (rd_stats_y_fi.rate == INT_MAX) {
         continue;
@@ -10223,6 +10928,9 @@ static int handle_intra_mode(InterModeSearchState *search_state,
                                  intra_mode_cost[mbmi->mode]);
       this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
 
+      if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) {
+        break;
+      }
       if (this_rd_tmp < best_rd_tmp) {
         best_tx_size = mbmi->tx_size;
         memcpy(best_txk_type, mbmi->txk_type,
@@ -10249,12 +10957,23 @@ static int handle_intra_mode(InterModeSearchState *search_state,
       mbmi->filter_intra_mode_info.use_filter_intra = 0;
     }
   }
-
-  if (rd_stats_y->rate == INT_MAX) return 0;
-
+  if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+  const int mode_cost_y =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+  av1_init_rd_stats(rd_stats);
+  av1_init_rd_stats(rd_stats_uv);
   if (num_planes > 1) {
     uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
     if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+      int rate_y =
+          rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
+      const int64_t rdy =
+          RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
+      if (search_state->best_rd < (INT64_MAX / 2) &&
+          rdy > (search_state->best_rd + (search_state->best_rd >> 2))) {
+        search_state->skip_intra_modes = 1;
+        return INT64_MAX;
+      }
       choose_intra_uv_mode(
           cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
           &search_state->rate_uv_tokenonly[uv_tx],
@@ -10262,6 +10981,14 @@ static int handle_intra_mode(InterModeSearchState *search_state,
           &search_state->mode_uv[uv_tx]);
       if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
       search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+
+      const int uv_rate = search_state->rate_uv_tokenonly[uv_tx];
+      const int64_t uv_dist = search_state->dist_uvs[uv_tx];
+      const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+      if (uv_rd > search_state->best_rd) {
+        search_state->skip_intra_modes = 1;
+        return INT64_MAX;
+      }
     }
 
     rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
@@ -10277,10 +11004,7 @@ static int handle_intra_mode(InterModeSearchState *search_state,
     }
     mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
   }
-
-  rd_stats->rate =
-      rd_stats_y->rate +
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+  rd_stats->rate = rd_stats_y->rate + mode_cost_y;
   if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
     // super_block_yrd above includes the cost of the tx_size in the
     // tokenonly rate, but for intra blocks, tx_size is always coded
@@ -10308,14 +11032,13 @@ static int handle_intra_mode(InterModeSearchState *search_state,
     rd_stats_y->rate = 0;
     rd_stats_uv->rate = 0;
     // Cost the skip mb case
-    rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
+    rd_stats->rate += x->skip_cost[skip_ctx][1];
   } else {
     // Add in the cost of the no skip flag.
-    rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0];
+    rd_stats->rate += x->skip_cost[skip_ctx][0];
   }
   // Calculate the final RD estimate for this mode.
-  int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-
+  const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
   // Keep record of best intra rd
   if (this_rd < search_state->best_intra_rd) {
     search_state->best_intra_rd = this_rd;
@@ -10333,14 +11056,322 @@ static int handle_intra_mode(InterModeSearchState *search_state,
       search_state->best_pred_rd[i] =
           AOMMIN(search_state->best_pred_rd[i], this_rd);
   }
-  return 1;
+  return this_rd;
+}
+
+static void collect_single_states(MACROBLOCK *x,
+                                  InterModeSearchState *search_state,
+                                  const MB_MODE_INFO *const mbmi) {
+  int i, j;
+  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1;
+  const int mode_offset = INTER_OFFSET(this_mode);
+  const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+  // Simple rd
+  int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+  for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+    if (rd < simple_rd) simple_rd = rd;
+  }
+
+  // Insertion sort of single_state
+  SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+  SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
+  i = search_state->single_state_cnt[dir][mode_offset];
+  for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
+    state_s[j] = state_s[j - 1];
+  state_s[j] = this_state_s;
+  search_state->single_state_cnt[dir][mode_offset]++;
+
+  // Modelled rd
+  int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+  for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+    if (rd < modelled_rd) modelled_rd = rd;
+  }
+
+  // Insertion sort of single_state_modelled
+  SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+  SingleInterModeState *state_m =
+      search_state->single_state_modelled[dir][mode_offset];
+  i = search_state->single_state_modelled_cnt[dir][mode_offset];
+  for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j)
+    state_m[j] = state_m[j - 1];
+  state_m[j] = this_state_m;
+  search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
+
+static void analyze_single_states(const AV1_COMP *cpi,
+                                  InterModeSearchState *search_state) {
+  int i, j, dir, mode;
+  if (cpi->sf.prune_comp_search_by_single_result >= 1) {
+    for (dir = 0; dir < 2; ++dir) {
+      int64_t best_rd;
+      SingleInterModeState(*state)[FWD_REFS];
+
+      // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+      // reference frames for all the modes (NEARESTMV and NEARMV may not
+      // have the same motion vectors). Always keep the best of each mode
+      // because it might form the best possible combination with other modes.
+      state = search_state->single_state[dir];
+      best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                       state[INTER_OFFSET(GLOBALMV)][0].rd);
+      for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+        for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+          if (state[mode][i].rd != INT64_MAX &&
+              (state[mode][i].rd >> 1) > best_rd) {
+            state[mode][i].valid = 0;
+          }
+        }
+      }
+
+      state = search_state->single_state_modelled[dir];
+      best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                       state[INTER_OFFSET(GLOBALMV)][0].rd);
+      for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+        for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
+             ++i) {
+          if (state[mode][i].rd != INT64_MAX &&
+              (state[mode][i].rd >> 1) > best_rd) {
+            state[mode][i].valid = 0;
+          }
+        }
+      }
+    }
+  }
+
+  // Ordering by simple rd first, then by modelled rd
+  for (dir = 0; dir < 2; ++dir) {
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+      const int state_cnt_m =
+          search_state->single_state_modelled_cnt[dir][mode];
+      SingleInterModeState *state_s = search_state->single_state[dir][mode];
+      SingleInterModeState *state_m =
+          search_state->single_state_modelled[dir][mode];
+      int count = 0;
+      const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+      for (i = 0; i < state_cnt_s; ++i) {
+        if (state_s[i].rd == INT64_MAX) break;
+        if (state_s[i].valid)
+          search_state->single_rd_order[dir][mode][count++] =
+              state_s[i].ref_frame;
+      }
+      if (count < max_candidates) {
+        for (i = 0; i < state_cnt_m; ++i) {
+          if (state_m[i].rd == INT64_MAX) break;
+          if (state_m[i].valid) {
+            int ref_frame = state_m[i].ref_frame;
+            int match = 0;
+            // Check if it already exists
+            for (j = 0; j < count; ++j) {
+              if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+                match = 1;
+                break;
+              }
+            }
+            if (!match) {
+              // Check if this ref_frame is removed in simple rd
+              int valid = 1;
+              for (j = 0; j < state_cnt_s; j++) {
+                if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) {
+                  valid = 0;
+                  break;
+                }
+              }
+              if (valid)
+                search_state->single_rd_order[dir][mode][count++] = ref_frame;
+            }
+            if (count >= max_candidates) break;
+          }
+        }
+      }
+    }
+  }
+}
+
+static int compound_skip_get_candidates(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const int dir, const PREDICTION_MODE mode) {
+  const int mode_offset = INTER_OFFSET(mode);
+  const SingleInterModeState *state =
+      search_state->single_state[dir][mode_offset];
+  const SingleInterModeState *state_modelled =
+      search_state->single_state_modelled[dir][mode_offset];
+  int max_candidates = 0;
+  int candidates;
+
+  for (int i = 0; i < FWD_REFS; ++i) {
+    if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
+    max_candidates++;
+  }
+
+  candidates = max_candidates;
+  if (cpi->sf.prune_comp_search_by_single_result >= 2) {
+    candidates = AOMMIN(2, max_candidates);
+  }
+  if (cpi->sf.prune_comp_search_by_single_result >= 3) {
+    if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+        state[0].ref_frame == state_modelled[0].ref_frame)
+      candidates = 1;
+    if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+  }
+  return candidates;
+}
+
+static int compound_skip_by_single_states(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+    const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+  const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+  const int mode[2] = { compound_ref0_mode(this_mode),
+                        compound_ref1_mode(this_mode) };
+  const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+  const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+                            refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+  int ref_searched[2] = { 0, 0 };
+  int ref_mv_match[2] = { 1, 1 };
+  int i, j;
+
+  for (i = 0; i < 2; ++i) {
+    const SingleInterModeState *state =
+        search_state->single_state[mode_dir[i]][mode_offset[i]];
+    const int state_cnt =
+        search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+    for (j = 0; j < state_cnt; ++j) {
+      if (state[j].ref_frame == refs[i]) {
+        ref_searched[i] = 1;
+        break;
+      }
+    }
+  }
+
+  const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+  for (i = 0; i < 2; ++i) {
+    if (mode[i] == NEARESTMV || mode[i] == NEARMV) {
+      const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+      int idential = 1;
+      for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+        int_mv single_mv;
+        int_mv comp_mv;
+        get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs,
+                    x->mbmi_ext);
+        get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext);
+
+        idential &= (single_mv.as_int == comp_mv.as_int);
+        if (!idential) {
+          ref_mv_match[i] = 0;
+          break;
+        }
+      }
+    }
+  }
+
+  for (i = 0; i < 2; ++i) {
+    if (ref_searched[i] && ref_mv_match[i]) {
+      const int candidates =
+          compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+      const MV_REFERENCE_FRAME *ref_order =
+          search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+      int match = 0;
+      for (j = 0; j < candidates; ++j) {
+        if (refs[i] == ref_order[j]) {
+          match = 1;
+          break;
+        }
+      }
+      if (!match) return 1;
+    }
+  }
+
+  return 0;
+}
+
+static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode,
+                                       InterModeSearchState *search_state) {
+  const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+  const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1];
+  if (search_state->num_available_refs > 2) {
+    if ((ref_frame == search_state->dist_order_refs[0] &&
+         second_ref_frame == search_state->dist_order_refs[1]) ||
+        (ref_frame == search_state->dist_order_refs[1] &&
+         second_ref_frame == search_state->dist_order_refs[0]))
+      return 1;  // drop this pair of refs
+  }
+  return 0;
+}
+
+static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state,
+                                       const MODE_DEFINITION *mode,
+                                       int64_t distortion2) {
+  const PREDICTION_MODE this_mode = mode->mode;
+  MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+  const int idx = ref_frame - LAST_FRAME;
+  if (idx && distortion2 > search_state->dist_refs[idx]) {
+    search_state->dist_refs[idx] = distortion2;
+    search_state->dist_order_refs[idx] = ref_frame;
+  }
+
+  // Reach the last single ref prediction mode
+  if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
+    // bubble sort dist_refs and the order index
+    for (int i = 0; i < REF_FRAMES; ++i) {
+      for (int k = i + 1; k < REF_FRAMES; ++k) {
+        if (search_state->dist_refs[i] < search_state->dist_refs[k]) {
+          int64_t tmp_dist = search_state->dist_refs[i];
+          search_state->dist_refs[i] = search_state->dist_refs[k];
+          search_state->dist_refs[k] = tmp_dist;
+
+          int tmp_idx = search_state->dist_order_refs[i];
+          search_state->dist_order_refs[i] = search_state->dist_order_refs[k];
+          search_state->dist_order_refs[k] = tmp_idx;
+        }
+      }
+    }
+    for (int i = 0; i < REF_FRAMES; ++i) {
+      if (search_state->dist_refs[i] == -1) break;
+      search_state->num_available_refs = i;
+    }
+    search_state->num_available_refs++;
+  }
+}
+
+static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+                                           CompoundTypeRdBuffers *const bufs) {
+  CHECK_MEM_ERROR(
+      cm, bufs->pred0,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+  CHECK_MEM_ERROR(
+      cm, bufs->pred1,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+  CHECK_MEM_ERROR(
+      cm, bufs->residual1,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+  CHECK_MEM_ERROR(
+      cm, bufs->diff10,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+                                        sizeof(*bufs->tmp_best_mask_buf)));
 }
 
-void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+static void release_compound_type_rd_buffers(
+    CompoundTypeRdBuffers *const bufs) {
+  aom_free(bufs->pred0);
+  aom_free(bufs->pred1);
+  aom_free(bufs->residual1);
+  aom_free(bufs->diff10);
+  aom_free(bufs->tmp_best_mask_buf);
+  av1_zero(*bufs);  // Set all pointers to NULL for safety.
+}
+
+void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                MACROBLOCK *x, int mi_row, int mi_col,
                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
-  const AV1_COMMON *const cm = &cpi->common;
+  AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -10350,9 +11381,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const struct segmentation *const seg = &cm->seg;
   PREDICTION_MODE this_mode;
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
-  int i, k;
+  int i;
   struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
   unsigned int ref_costs_single[REF_FRAMES];
   unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
@@ -10364,28 +11394,57 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   InterModeSearchState search_state;
   init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
                                best_rd_so_far);
-
+  INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+  };
   HandleInterModeArgs args = {
     { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
     { NULL },  { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
     NULL,      NULL,
-    NULL,      NULL,
+    NULL,      search_state.modelled_rd,
     { { 0 } }, INT_MAX,
-    INT_MAX
+    INT_MAX,   search_state.simple_rd,
+    0,         interintra_modes
   };
   for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
 
   av1_invalid_rd_stats(rd_cost);
 
   // init params, set frame modes, speed features
-  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
-                                ref_frame_skip_mask, mode_skip_mask,
-                                ref_costs_single, ref_costs_comp, yv12_mb);
+  set_params_rd_pick_inter_mode(
+      cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
+      ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
 
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
   int64_t best_est_rd = INT64_MAX;
+  // TODO(angiebird): Turn this on when this speed feature is well tested
+#if 1
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  const int do_tx_search = !md->ready;
+#else
+  const int do_tx_search = 1;
+#endif
+  InterModesInfo *inter_modes_info = &tile_data->inter_modes_info;
+  inter_modes_info->num = 0;
 #endif
 
+  int intra_mode_num = 0;
+  int intra_mode_idx_ls[MAX_MODES];
+  int reach_first_comp_mode = 0;
+
+  // Temporary buffers used by handle_inter_mode().
+  // We allocate them once and reuse them in every call to that function.
+  // Note: Must be allocated on the heap due to large size of the arrays.
+  uint8_t *tmp_buf_orig;
+  CHECK_MEM_ERROR(
+      cm, tmp_buf_orig,
+      (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE));
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig);
+
+  CompoundTypeRdBuffers rd_buffers;
+  alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
   for (int midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int64_t this_rd = INT64_MAX;
@@ -10394,42 +11453,44 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     int64_t distortion2 = 0;
     int skippable = 0;
     int this_skip2 = 0;
-
-    this_mode = av1_mode_order[mode_index].mode;
-    ref_frame = av1_mode_order[mode_index].ref_frame[0];
-    second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
+    const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+    const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+    const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+    const int comp_pred = second_ref_frame > INTRA_FRAME;
+    this_mode = mode_order->mode;
 
     init_mbmi(mbmi, mode_index, cm);
 
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
-    if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index,
-                                                 mi_row, mi_col, mode_skip_mask,
-                                                 ref_frame_skip_mask))
-      continue;
-
-    if (ref_frame == INTRA_FRAME) {
-      if (sf->skip_intra_in_interframe && search_state.skip_intra_modes)
-        continue;
+    // Reach the first compound prediction mode
+    if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+        reach_first_comp_mode == 0) {
+      analyze_single_states(cpi, &search_state);
+      reach_first_comp_mode = 1;
     }
+    const int ret = inter_mode_search_order_independent_skip(
+        cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
+        ref_frame_skip_mask, &search_state);
+    if (ret == 1) continue;
+    args.skip_motion_mode = (ret == 2);
 
-    if (sf->drop_ref) {
-      if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) {
-        if (search_state.num_available_refs > 2) {
-          if ((ref_frame == search_state.dist_order_refs[0] &&
-               second_ref_frame == search_state.dist_order_refs[1]) ||
-              (ref_frame == search_state.dist_order_refs[1] &&
-               second_ref_frame == search_state.dist_order_refs[0]))
-            continue;
-        }
+    if (sf->drop_ref && comp_pred) {
+      if (sf_check_is_drop_ref(mode_order, &search_state)) {
+        continue;
       }
     }
 
     if (search_state.best_rd < search_state.mode_threshold[mode_index])
       continue;
 
-    const int comp_pred = second_ref_frame > INTRA_FRAME;
+    if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+      if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+                                         ref_frame, second_ref_frame, x))
+        continue;
+    }
+
     const int ref_frame_cost = comp_pred
                                    ? ref_costs_comp[ref_frame][second_ref_frame]
                                    : ref_costs_single[ref_frame];
@@ -10474,18 +11535,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     }
 
     if (ref_frame == INTRA_FRAME) {
-      RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
-      const int ret = handle_intra_mode(
-          &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip,
-          &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
-      if (!ret) {
-        continue;
-      }
-      rate2 = intra_rd_stats.rate;
-      distortion2 = intra_rd_stats.dist;
-      this_rd = RDCOST(x->rdmult, rate2, distortion2);
-      skippable = intra_rd_stats.skip;
-      rate_y = intra_rd_stats_y.rate;
+      intra_mode_idx_ls[intra_mode_num++] = mode_index;
+      continue;
     } else {
       mbmi->angle_delta[PLANE_TYPE_Y] = 0;
       mbmi->angle_delta[PLANE_TYPE_UV] = 0;
@@ -10501,17 +11552,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         args.single_newmv = search_state.single_newmv;
         args.single_newmv_rate = search_state.single_newmv_rate;
         args.single_newmv_valid = search_state.single_newmv_valid;
-        args.modelled_rd = search_state.modelled_rd;
         args.single_comp_cost = real_compmode_cost;
         args.ref_frame_cost = ref_frame_cost;
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-        this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
-                                    &rd_stats_uv, &disable_skip, mi_row, mi_col,
-                                    &args, ref_best_rd, &best_est_rd);
+        this_rd = handle_inter_mode(
+            cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
+            mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
+            &best_est_rd, do_tx_search, inter_modes_info);
 #else
         this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
                                     &rd_stats_uv, &disable_skip, mi_row, mi_col,
-                                    &args, ref_best_rd);
+                                    &args, ref_best_rd, tmp_buf, &rd_buffers);
 #endif
         rate2 = rd_stats.rate;
         skippable = rd_stats.skip;
@@ -10520,6 +11571,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         rate_uv = rd_stats_uv.rate;
       }
 
+      if (sf->prune_comp_search_by_single_result > 0 &&
+          is_inter_singleref_mode(this_mode)) {
+        collect_single_states(x, &search_state, mbmi);
+      }
+
       if (this_rd == INT64_MAX) continue;
 
       this_skip2 = mbmi->skip;
@@ -10554,10 +11610,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         search_state.best_mbmode = *mbmi;
         search_state.best_skip2 = this_skip2;
         search_state.best_mode_skippable = skippable;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+        if (do_tx_search) {
+          // When do_tx_search == 0, handle_inter_mode won't provide correct
+          // rate_y and rate_uv because txfm_search process is replaced by
+          // rd estimation.
+          // Therefore, we should avoid updating best_rate_y and best_rate_uv
+          // here. These two values will be updated when txfm_search is called
+          search_state.best_rate_y =
+              rate_y +
+              x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+          search_state.best_rate_uv = rate_uv;
+        }
+#else   // CONFIG_COLLECT_INTER_MODE_RD_STATS
         search_state.best_rate_y =
             rate_y +
             x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
         search_state.best_rate_uv = rate_uv;
+#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
         memcpy(ctx->blk_skip, x->blk_skip,
                sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       }
@@ -10588,43 +11658,124 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
         search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
+    if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+      // Collect data from single ref mode, and analyze data.
+      sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+    }
 
-    if (sf->drop_ref) {
-      if (second_ref_frame == NONE_FRAME) {
-        const int idx = ref_frame - LAST_FRAME;
-        if (idx && distortion2 > search_state.dist_refs[idx]) {
-          search_state.dist_refs[idx] = distortion2;
-          search_state.dist_order_refs[idx] = ref_frame;
-        }
+    if (x->skip && !comp_pred) break;
+  }
 
-        // Reach the last single ref prediction mode
-        if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
-          // bubble sort dist_refs and the order index
-          for (i = 0; i < REF_FRAMES; ++i) {
-            for (k = i + 1; k < REF_FRAMES; ++k) {
-              if (search_state.dist_refs[i] < search_state.dist_refs[k]) {
-                int64_t tmp_dist = search_state.dist_refs[i];
-                search_state.dist_refs[i] = search_state.dist_refs[k];
-                search_state.dist_refs[k] = tmp_dist;
-
-                int tmp_idx = search_state.dist_order_refs[i];
-                search_state.dist_order_refs[i] =
-                    search_state.dist_order_refs[k];
-                search_state.dist_order_refs[k] = tmp_idx;
-              }
-            }
-          }
+  aom_free(tmp_buf_orig);
+  tmp_buf_orig = NULL;
+  release_compound_type_rd_buffers(&rd_buffers);
 
-          for (i = 0; i < REF_FRAMES; ++i) {
-            if (search_state.dist_refs[i] == -1) break;
-            search_state.num_available_refs = i;
-          }
-          search_state.num_available_refs++;
-        }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+  if (!do_tx_search) {
+    inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+    search_state.best_rd = INT64_MAX;
+
+    int64_t top_est_rd =
+        inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+    for (int j = 0; j < inter_modes_info->num; ++j) {
+      const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+      *mbmi = inter_modes_info->mbmi_arr[data_idx];
+      int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+      if (curr_est_rd * 0.9 > top_est_rd) {
+        continue;
+      }
+      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+      x->skip = 0;
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+      // Select prediction reference frames.
+      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+      for (i = 0; i < num_planes; i++) {
+        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+      }
+
+      RD_STATS rd_stats;
+      RD_STATS rd_stats_y;
+      RD_STATS rd_stats_uv;
+
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+      if (mbmi->motion_mode == OBMC_CAUSAL)
+        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+      if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
+                       &rd_stats_uv, mode_rate, search_state.best_rd)) {
+        continue;
+      } else {
+        const int skip_ctx = av1_get_skip_context(xd);
+        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+                             rd_stats.dist,
+                             rd_stats_y.rate + rd_stats_uv.rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip]);
+      }
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+      if (rd_stats.rdcost < search_state.best_rd) {
+        search_state.best_rd = rd_stats.rdcost;
+        // Note index of best mode so far
+        const int mode_index = get_prediction_mode_idx(
+            mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        search_state.best_mode_index = mode_index;
+        *rd_cost = rd_stats;
+        search_state.best_rd = rd_stats.rdcost;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = mbmi->skip;
+        search_state.best_mode_skippable = rd_stats.skip;
+        search_state.best_rate_y =
+            rd_stats_y.rate +
+            x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+        search_state.best_rate_uv = rd_stats_uv.rate;
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       }
     }
+  }
+#endif
 
-    if (x->skip && !comp_pred) break;
+  for (int j = 0; j < intra_mode_num; ++j) {
+    const int mode_index = intra_mode_idx_ls[j];
+    const MV_REFERENCE_FRAME ref_frame =
+        av1_mode_order[mode_index].ref_frame[0];
+    assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+    assert(ref_frame == INTRA_FRAME);
+    if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+    init_mbmi(mbmi, mode_index, cm);
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+    }
+
+    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+    const int ref_frame_cost = ref_costs_single[ref_frame];
+    intra_rd_stats.rdcost = handle_intra_mode(
+        &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+        &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+    if (intra_rd_stats.rdcost < search_state.best_rd) {
+      search_state.best_rd = intra_rd_stats.rdcost;
+      // Note index of best mode so far
+      search_state.best_mode_index = mode_index;
+      *rd_cost = intra_rd_stats;
+      search_state.best_rd = intra_rd_stats.rdcost;
+      search_state.best_mbmode = *mbmi;
+      search_state.best_skip2 = 0;
+      search_state.best_mode_skippable = intra_rd_stats.skip;
+      search_state.best_rate_y =
+          intra_rd_stats_y.rate +
+          x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+      search_state.best_rate_uv = intra_rd_stats_uv.rate;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    }
   }
 
   // In effect only when speed >= 2.
@@ -10635,7 +11786,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
   // Only try palette mode when the best mode so far is an intra mode.
   if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
-    search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi,
+    search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
                         ref_costs_single, &search_state);
   }
 
@@ -10776,11 +11927,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
   if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-    mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
     // Select the samples according to motion vector difference
-    if (mbmi->num_proj_ref[0] > 1)
-      mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
-                                            mbmi->num_proj_ref[0], bsize);
+    if (mbmi->num_proj_ref > 1)
+      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                         mbmi->num_proj_ref, bsize);
   }
 
   set_default_interp_filters(mbmi, cm->interp_filter);
@@ -10853,7 +12004,7 @@ static INLINE void calc_target_weighted_pred_above(
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const int bw = xd->n4_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
@@ -10899,7 +12050,7 @@ static INLINE void calc_target_weighted_pred_left(
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const int bw = xd->n4_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
@@ -10982,8 +12133,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       int above_stride, const uint8_t *left,
                                       int left_stride) {
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
-  const int bh = xd->n8_h << MI_SIZE_LOG2;
+  const int bw = xd->n4_w << MI_SIZE_LOG2;
+  const int bh = xd->n4_h << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
 
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index 12df472c1..4c11f90b8 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RDOPT_H_
-#define AV1_ENCODER_RDOPT_H_
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
 
 #include "av1/common/blockd.h"
 #include "av1/common/txb_common.h"
@@ -25,6 +25,10 @@ extern "C" {
 #endif
 
 #define MAX_REF_MV_SERCH 3
+#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1
+#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2
+#define DEFAULT_INTERP_SKIP_FLAG \
+  (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG)
 
 struct TileInfo;
 struct macroblock;
@@ -111,7 +115,7 @@ unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd);
 
-void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
                                struct TileDataEnc *tile_data,
                                struct macroblock *x, int mi_row, int mi_col,
                                struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -123,14 +127,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(
     BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-#define INTER_MODE_RD_TEST 0
-void av1_inter_mode_data_init();
-void av1_inter_mode_data_fit(int rdmult);
-void av1_inter_mode_data_show(const AV1_COMMON *cm);
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
 #endif
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RDOPT_H_
+#endif  // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 000000000..23d920fc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void calc_subpel_params(
+    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+    int plane, const int pre_x, const int pre_y, int x, int y,
+    struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
+    int bw, int bh) {  // Outputs: *pre (source pointer) and *subpel_params.
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int is_scaled = av1_is_scaled(sf);
+  if (is_scaled) {  // Scaled path: map the position through sf, then clamp.
+    int ssx = pd->subsampling_x;
+    int ssy = pd->subsampling_y;
+    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+    orig_pos_y += mv.row * (1 << (1 - ssy));
+    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+    orig_pos_x += mv.col * (1 << (1 - ssx));
+    int pos_y = sf->scale_value_y(orig_pos_y, sf);
+    int pos_x = sf->scale_value_x(orig_pos_x, sf);
+    pos_x += SCALE_EXTRA_OFF;
+    pos_y += SCALE_EXTRA_OFF;
+
+    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                       << SCALE_SUBPEL_BITS;
+    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+    pos_y = clamp(pos_y, top, bottom);  // Clamp to [top, bottom].
+    pos_x = clamp(pos_x, left, right);  // Clamp to [left, right].
+
+    *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+           (pos_x >> SCALE_SUBPEL_BITS);  // Offset from buf0, not buf.
+    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+    subpel_params->xs = sf->x_step_q4;
+    subpel_params->ys = sf->y_step_q4;
+  } else {  // Unscaled path: clamp the MV and offset from pre_buf->buf.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(
+        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+           (x + (mv_q4.col >> SUBPEL_BITS));
+  }
+}
+
+static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          int plane, const MB_MODE_INFO *mi,
+                                          int build_for_obmc, int bw, int bh,
+                                          int mi_x, int mi_y) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int is_compound = has_second_ref(mi);
+  int ref;
+  const int is_intrabc = is_intrabc_block(mi);
+  assert(IMPLIES(is_intrabc, !is_compound));
+  int is_global[2] = { 0, 0 };  // Per-reference is_global_mv_block() result.
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+  }
+
+  const BLOCK_SIZE bsize = mi->sb_type;
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+                     (block_size_high[bsize] < 8 && ss_y);
+
+  if (is_intrabc) sub8x8_inter = 0;  // intrabc never takes the sub8x8 path.
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start =
+      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+  const int col_start =
+      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  sub8x8_inter = sub8x8_inter && !build_for_obmc;  // Nor does an OBMC build.
+  if (sub8x8_inter) {  // Every covered block must be inter and not intrabc.
+    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+      for (int col = col_start; col <= 0; ++col) {
+        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+      }
+    }
+  }
+
+  if (sub8x8_inter) {  // Predict each b4_w x b4_h piece from its own mbmi.
+    // block size
+    const int b4_w = block_size_wide[bsize] >> ss_x;
+    const int b4_h = block_size_high[bsize] >> ss_y;
+    const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+    const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+    const int b8_h = block_size_high[plane_bsize] >> ss_y;
+    assert(!is_compound);
+
+    const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+    int row = row_start;
+    for (int y = 0; y < b8_h; y += b4_h) {
+      int col = col_start;
+      for (int x = 0; x < b8_w; x += b4_w) {
+        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+        is_compound = has_second_ref(this_mbmi);
+        int tmp_dst_stride = 8;
+        assert(bw < 8 || bh < 8);
+        ConvolveParams conv_params = get_conv_params_no_round(
+            0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+        conv_params.use_jnt_comp_avg = 0;
+        struct buf_2d *const dst_buf = &pd->dst;
+        uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+        ref = 0;  // Only reference 0 is predicted on this path.
+        const RefBuffer *ref_buf =
+            &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+        pd->pre[ref].buf0 =
+            (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+        pd->pre[ref].buf =
+            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+                                                     ref_buf->buf->uv_stride,
+                                                     &ref_buf->sf);
+        pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+        pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+        pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+        const struct scale_factors *const sf =
+            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+        const MV mv = this_mbmi->mv[ref].as_mv;
+
+        uint8_t *pre;
+        SubpelParams subpel_params;
+        WarpTypesAllowed warp_types;
+        warp_types.global_warp_allowed = is_global[ref];
+        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+        calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+                           &subpel_params, bw, bh);
+        conv_params.do_average = ref;
+        if (is_masked_compound_type(mi->interinter_comp.type)) {
+          // masked compound type has its own average mechanism
+          conv_params.do_average = 0;
+        }
+
+        av1_make_inter_predictor(
+            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
+            b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+        ++col;
+      }
+      ++row;
+    }
+
+    for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];  // Restore.
+    return;
+  }
+
+  {  // Regular path: one prediction per reference of mi.
+    ConvolveParams conv_params = get_conv_params_no_round(
+        0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+                               &conv_params.bck_offset,
+                               &conv_params.use_jnt_comp_avg, is_compound);
+
+    struct buf_2d *const dst_buf = &pd->dst;
+    uint8_t *const dst = dst_buf->buf;
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const struct scale_factors *const sf =
+          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+      const MV mv = mi->mv[ref].as_mv;
+
+      uint8_t *pre;
+      SubpelParams subpel_params;
+      calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
+                         &subpel_params, bw, bh);
+
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global[ref];
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
+        // masked compound type has its own average mechanism
+        conv_params.do_average = 0;
+        av1_make_masked_inter_predictor(
+            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+            bh, &conv_params, mi->interp_filters, plane, &warp_types,
+            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
+            cm->allow_warped_motion);
+      } else {
+        conv_params.do_average = ref;  // Average only on the second reference.
+        av1_make_inter_predictor(
+            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+            bh, &conv_params, mi->interp_filters, &warp_types,
+            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
+            mi, build_for_obmc, xd, cm->allow_warped_motion);
+      }
+    }
+  }
+}
+
+static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
+                                              MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col,
+                                              int plane_from, int plane_to) {
+  int plane;  // Iterates plane_from..plane_to, both inclusive.
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    const int bw = pd->width;
+    const int bh = pd->height;
+
+    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                             pd->subsampling_y))
+      continue;  // Skip planes for which is_chroma_reference() returns 0.
+
+    build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
+  }
+}
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int mi_row, int mi_col, BUFFER_SET *ctx,
+                                    BLOCK_SIZE bsize) {  // Plane index 0 only.
+  av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
+}
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     int mi_row, int mi_col, BUFFER_SET *ctx,
+                                     BLOCK_SIZE bsize) {
+  for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {  // Not 0.
+    av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
+                                   plane_idx);
+  }
+}
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int mi_row, int mi_col, BUFFER_SET *ctx,
+                                    BLOCK_SIZE bsize, int plane_idx) {
+  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
+                                    plane_idx);  // Single plane: from == to.
+
+  if (is_interintra_pred(xd->mi[0])) {  // Then run the interintra builder.
+    BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
+    if (!ctx) {  // No ctx supplied: point one at this plane's dst buffer.
+      default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf;
+      default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride;
+      ctx = &default_ctx;
+    }
+    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf,
+                                        xd->plane[plane_idx].dst.stride, ctx,
+                                        plane_idx, bsize);
+  }
+}
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col, BUFFER_SET *ctx,
+                                   BLOCK_SIZE bsize) {
+  const int num_planes = av1_num_planes(cm);
+  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+  if (num_planes > 1)  // The other planes are built only when they exist.
+    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+}
+
+// TODO(sarahparker):
+// av1_build_inter_predictor should be combined with
+// av1_make_inter_predictor
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, const MV *src_mv,
+                               const struct scale_factors *sf, int w, int h,
+                               ConvolveParams *conv_params,
+                               InterpFilters interp_filters,
+                               const WarpTypesAllowed *warp_types, int p_col,
+                               int p_row, int plane, int ref,
+                               enum mv_precision precision, int x, int y,
+                               const MACROBLOCKD *xd, int can_use_previous) {
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+                     is_q4 ? src_mv->col : src_mv->col * 2 };  // Else doubled.
+  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
+  mv.col += SCALE_EXTRA_OFF;
+  mv.row += SCALE_EXTRA_OFF;
+
+  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                       mv.col & SCALE_SUBPEL_MASK,
+                                       mv.row & SCALE_SUBPEL_MASK };
+  src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+         (mv.col >> SCALE_SUBPEL_BITS);  // Bits above the subpel mask.
+
+  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
+                           w, h, conv_params, interp_filters, warp_types, p_col,
+                           p_row, plane, ref, xd->mi[0], 0, xd,
+                           can_use_previous);  // The literal 0: build_for_obmc.
+}
+
+static INLINE void build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+  int mi_x, mi_y;
+  MB_MODE_INFO backup_mbmi = *above_mbmi;  // Copied back before returning.
+
+  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+                                           above_mbmi, ctxt, num_planes);
+  mi_x = above_mi_col << MI_SIZE_LOG2;
+  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int j = 0; j < num_planes; ++j) {  // bh: half plane height, clamped.
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+    build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
+  }
+  *above_mbmi = backup_mbmi;  // Restore the neighbor's mode info.
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_width[MAX_MB_PLANE],
+                                         int tmp_height[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->up_available) return;  // No-op when up_available is 0.
+
+  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+  // prediction block. This is half the height of the original block,
+  // capped at 32 (the cap only takes effect for blocks taller than 64).
+  int this_height = xd->n4_h * MI_SIZE;
+  int pred_height = AOMMIN(this_height / 2, 32);
+  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_right_edge };
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                build_prediction_by_above_pred, &ctxt);
+
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ctxt.mb_to_far_edge;  // Value saved in ctxt above.
+  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;  // Undo the shift.
+}
+
+static INLINE void build_prediction_by_left_pred(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+  int mi_x, mi_y;
+  MB_MODE_INFO backup_mbmi = *left_mbmi;  // Copied back into place on exit.
+  // Callback used with foreach_overlappable_nb_left: predicts one neighbour.
+  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+                                          left_mbmi, ctxt, num_planes);
+  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+  mi_y = left_mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int j = 0; j < num_planes; ++j) {
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+    // bw: half the current block's plane width, clamped; bh: neighbour height.
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+    build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
+  }
+  *left_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col,
+                                        uint8_t *tmp_buf[MAX_MB_PLANE],
+                                        int tmp_width[MAX_MB_PLANE],
+                                        int tmp_height[MAX_MB_PLANE],
+                                        int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->left_available) return;  // Bail out when left_available is unset.
+
+  // Temporarily move mb_to_right_edge so that it describes the OBMC
+  // prediction block rather than the full block. Its width is half the
+  // block width, capped at 32 (so blocks wider than 64 are clipped).
+  int this_width = xd->n4_w * MI_SIZE;
+  int pred_width = AOMMIN(this_width / 2, 32);
+  xd->mb_to_right_edge += (this_width - pred_width) * 8;
+  // The last initializer snapshots mb_to_bottom_edge for the restore below.
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_bottom_edge };
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[mi_size_high_log2[bsize]],
+                               build_prediction_by_left_pred, &ctxt);
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_right_edge -= (this_width - pred_width) * 8;  // Undo the shift.
+  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;  // Presumably that snapshot.
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col) {
+  const int num_planes = av1_num_planes(cm);
+  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];  // above / left
+  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  // Carve one MAX_SB_SQUARE-sample region per plane from each scratch buffer.
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);  // Bytes per sample on this path.
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+    dst_buf1[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+    dst_buf1[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+    dst_buf2[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+    dst_buf2[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+  } else {
+    dst_buf1[0] = xd->tmp_obmc_bufs[0];
+    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+    dst_buf2[0] = xd->tmp_obmc_bufs[1];
+    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+  }
+  av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+                                      dst_width1, dst_height1, dst_stride1);
+  av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+                                     dst_width2, dst_height2, dst_stride2);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+                       mi_row, mi_col, 0, num_planes);
+  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+                                  dst_buf2, dst_stride2);  // Uses both sets.
+}
+
+// Builds the single-reference inter prediction for one plane into the
+// caller-supplied buffer ext_dst; the encoder uses it when searching wedges.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+                                              int bw, int bh, int x, int y,
+                                              int w, int h, int mi_x, int mi_y,
+                                              int ref, uint8_t *const ext_dst,
+                                              int ext_dst_stride,
+                                              int can_use_previous) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *mi = xd->mi[0];
+  // dst below points into ext_dst at (x, y), not at the plane's dst buffer.
+  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+  struct buf_2d *const pre_buf = &pd->pre[ref];
+  uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
+  const MV mv = mi->mv[ref].as_mv;
+
+  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+  WarpTypesAllowed warp_types;
+  const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+  warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+  warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+  const int pre_x = (mi_x) >> pd->subsampling_x;  // Block origin, this plane.
+  const int pre_y = (mi_y) >> pd->subsampling_y;
+  uint8_t *pre;  // Filled in by calc_subpel_params().
+  SubpelParams subpel_params;
+  calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+                     &subpel_params, bw, bh);
+
+  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
+                           &subpel_params, sf, w, h, &conv_params,
+                           mi->interp_filters, &warp_types, pre_x + x,
+                           pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+    int can_use_previous) {
+  // Predict reference 'ref' for planes [plane_from, plane_to] into ext_dst[].
+  const int px_col = mi_col * MI_SIZE;
+  const int px_row = mi_row * MI_SIZE;
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    const BLOCK_SIZE pbs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int w = block_size_wide[pbs], h = block_size_high[pbs];
+    build_inter_predictors_single_buf(xd, p, w, h, 0, 0, w, h, px_col, px_row,
+                                      ref, ext_dst[p], ext_dst_stride[p],
+                                      can_use_previous);
+  }
+}
+
+static void build_masked_compound(
+    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+    int w) {
+  const uint8_t *const blend_mask =
+      av1_get_compound_type_mask(comp_data, sb_type);
+  // The subsampling flags are inferred from w and h rather than passed in.
+  const int ss_x = w == (2 << mi_size_wide_log2[sb_type]);
+  const int ss_y = h == (2 << mi_size_high_log2[sb_type]);
+  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                     blend_mask, block_size_wide[sb_type], w, h, ss_x, ss_y);
+}
+
+static void build_masked_compound_highbd(
+    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+    int w, int bd) {
+  // Derive subsampling from h and w passed in. May be refactored to
+  // pass in subsampling factors directly.
+  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+  // Blend src0_8 and src1_8 into dst_8 through the mask. The argument after
+  // the mask (block_size_wide[sb_type]) is the mask stride in the aom_dsp API.
+  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                            src1_stride, mask, block_size_wide[sb_type], w, h,
+                            subw, subh, bd);
+}
+
+static void build_wedge_inter_predictor_from_buf(
+    MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+    int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_compound = has_second_ref(mbmi);
+  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+  mbmi->interinter_comp.seg_mask = xd->seg_mask;  // Writes into mbmi.
+  const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+  // A DIFFWTD mask is derived from the two buffers, and only for plane 0.
+  if (is_compound && is_masked_compound_type(comp_data->type)) {
+    if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+        av1_build_compound_diffwtd_mask_highbd(
+            comp_data->seg_mask, comp_data->mask_type,
+            CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+      else
+        av1_build_compound_diffwtd_mask(
+            comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+            ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+    }
+    // Blend the two predictions into dst using the compound mask.
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      build_masked_compound_highbd(
+          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
+          mbmi->sb_type, h, w, xd->bd);
+    else
+      build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+                            h, w);
+  } else {  // Not a masked compound: plain copy of ext_dst0.
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+                               xd->bd);
+    else
+      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
+                        0, NULL, 0, w, h);
+  }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[3],
+                                              int ext_dst_stride0[3],
+                                              uint8_t *ext_dst1[3],
+                                              int ext_dst_stride1[3]) {
+  // Combine the two pre-built predictions for planes [plane_from, plane_to].
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    const BLOCK_SIZE pbs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int w = block_size_wide[pbs], h = block_size_high[pbs];
+    build_wedge_inter_predictor_from_buf(xd, p, 0, 0, w, h, ext_dst0[p],
+                                         ext_dst_stride0[p], ext_dst1[p],
+                                         ext_dst_stride1[p]);
+  }
+}
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 000000000..10d5e8c28
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/filter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int mi_row, int mi_col, BUFFER_SET *ctx,
+                                    BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     int mi_row, int mi_col, BUFFER_SET *ctx,
+                                     BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int mi_row, int mi_col, BUFFER_SET *ctx,
+                                    BLOCK_SIZE bsize, int plane_idx);
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col, BUFFER_SET *ctx,
+                                   BLOCK_SIZE bsize);
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, const MV *src_mv,
+                               const struct scale_factors *sf, int w, int h,
+                               ConvolveParams *conv_params,
+                               InterpFilters interp_filters,
+                               const WarpTypesAllowed *warp_types, int p_col,
+                               int p_row, int plane, int ref,
+                               enum mv_precision precision, int x, int y,
+                               const MACROBLOCKD *xd, int can_use_previous);
+
+// Detects whether one component (row or column) of one of the block's
+// motion vectors has a sub-pixel part.
+#define CHECK_SUBPEL 0
+static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
+                                          const MACROBLOCKD *const xd,
+                                          int dir) {
+#if CHECK_SUBPEL
+  // Bit 0 of dir picks the column (1) or row (0) component; the remaining
+  // bits select which of the block's motion vectors is inspected.
+  const int ref = (dir >> 1);
+  (void)xd;  // Not needed by the check itself.
+  if (dir & 0x01) {
+    if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
+  } else {
+    if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
+  }
+  return 0;
+#else
+  // Check compiled out: report a sub-pel component for every query.
+  (void)mbmi;
+  (void)xd;
+  (void)dir;
+  return 1;
+#endif
+}
+
+static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
+  // Returns 1 as soon as has_subpel_mv_component() reports a sub-pel part for
+  // any MV component of the reference(s) in use, and 0 otherwise.
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_refs = 1 + has_second_ref(mbmi);
+
+  // dir = (ref << 1) + component, so refs 0..num_refs-1 cover dir values
+  // 0..2*num_refs-1, visited here in the same order as a nested loop.
+  int dir = 0;
+  while (dir < 2 * num_refs) {
+    if (has_subpel_mv_component(mbmi, xd, dir)) return 1;
+    ++dir;
+  }
+  return 0;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_width[MAX_MB_PLANE],
+                                         int tmp_height[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col,
+                                        uint8_t *tmp_buf[MAX_MB_PLANE],
+                                        int tmp_width[MAX_MB_PLANE],
+                                        int tmp_height[MAX_MB_PLANE],
+                                        int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col);
+
+void av1_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+    int can_use_previous);
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[3],
+                                              int ext_dst_stride0[3],
+                                              uint8_t *ext_dst1[3],
+                                              int ext_dst_stride1[3]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
index a207b0f26..1ad13d66a 100644
--- a/third_party/aom/av1/encoder/segmentation.h
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_SEGMENTATION_H_
-#define AV1_ENCODER_SEGMENTATION_H_
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
 
 #include "av1/common/blockd.h"
 #include "av1/encoder/encoder.h"
@@ -35,4 +35,4 @@ void av1_reset_segment_features(AV1_COMMON *cm);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_SEGMENTATION_H_
+#endif  // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index d4b4b19c4..4c35baae0 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -98,6 +98,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
     sf->use_square_partition_only_threshold = BLOCK_64X64;
   }
 
+  // TODO(huisu@google.com): train models for 720P and above.
+  if (!is_720p_or_larger) {
+    sf->ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
+    sf->ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
+    sf->ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
+    sf->ml_partition_search_breakout_thresh[3] = 500;  // BLOCK_64X64
+    sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
+  }
+
   if (speed >= 1) {
     if (is_720p_or_larger) {
       sf->use_square_partition_only_threshold = BLOCK_128X128;
@@ -106,6 +115,14 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
     } else {
       sf->use_square_partition_only_threshold = BLOCK_32X32;
     }
+
+    if (!is_720p_or_larger) {
+      sf->ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
+      sf->ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
+      sf->ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
+      sf->ml_partition_search_breakout_thresh[3] = 300;  // BLOCK_64X64
+      sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
+    }
   }
 
   if (speed >= 2) {
@@ -126,13 +143,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
   if (speed >= 3) {
     if (is_720p_or_larger) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
-      sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
       sf->partition_search_breakout_dist_thr = (1 << 25);
       sf->partition_search_breakout_rate_thr = 200;
     } else {
       sf->max_intra_bsize = BLOCK_32X32;
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-      sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
       sf->partition_search_breakout_dist_thr = (1 << 23);
       sf->partition_search_breakout_rate_thr = 120;
     }
@@ -166,6 +181,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   // Speed 0 for all speed features that give neutral coding performance change.
   sf->reduce_inter_modes = 1;
   sf->prune_ext_partition_types_search_level = 1;
+  sf->ml_prune_rect_partition = 1;
   sf->ml_prune_ab_partition = 1;
   sf->ml_prune_4_partition = 1;
   sf->adaptive_txb_search_level = 1;
@@ -173,6 +189,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   sf->model_based_prune_tx_search_level = 1;
   sf->model_based_post_interp_filter_breakout = 1;
   sf->inter_mode_rd_model_estimation = 1;
+  sf->prune_ref_frame_for_rect_partitions =
+      !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+  sf->less_rectangular_check_level = 1;
+  sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+  sf->gm_disable_recode = 1;
 
   if (speed >= 1) {
     sf->gm_erroradv_type = GM_ERRORADV_TR_1;
@@ -182,8 +203,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->intra_tx_size_search_init_depth_rect = 1;
     sf->intra_tx_size_search_init_depth_sqr = 1;
     sf->tx_size_search_lgr_block = 1;
-    sf->two_pass_partition_search = 1;
-    sf->mode_pruning_based_on_two_pass_partition_search = 1;
+    if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
+      sf->two_pass_partition_search = 1;
+      sf->mode_pruning_based_on_two_pass_partition_search = 1;
+    }
     sf->prune_ext_partition_types_search_level = 2;
     sf->use_fast_interpolation_filter_search = 1;
     sf->skip_repeat_interpolation_filter_search = 1;
@@ -198,6 +221,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->use_intra_txb_hash = 1;
     sf->optimize_b_precheck = 1;
     sf->dual_sgr_penalty_level = 1;
+    sf->use_accurate_subpel_search = 1;
+    sf->reuse_inter_intra_mode = 1;
+    sf->prune_comp_search_by_single_result = 1;
+    sf->skip_repeated_newmv = 1;
+    sf->obmc_full_pixel_search_level = 1;
   }
 
   if (speed >= 2) {
@@ -206,7 +234,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->selective_ref_frame = 2;
     sf->fast_cdef_search = 1;
 
-    sf->use_rd_breakout = 1;
     sf->adaptive_rd_thresh = 1;
     sf->mv.auto_mv_step_size = 1;
     sf->mv.subpel_iters_per_step = 1;
@@ -224,8 +251,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
 
   if (speed >= 3) {
     sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
-    sf->less_rectangular_check = 1;
-    sf->mode_skip_start = 10;
+    sf->less_rectangular_check_level = 2;
     sf->adaptive_pred_interp_filter = 1;
     // adaptive_motion_search breaks encoder multi-thread tests.
     // The values in x->pred_mv[] differ for single and multi-thread cases.
@@ -237,6 +263,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->adaptive_rd_thresh = 2;
     sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
     sf->gm_search_type = GM_DISABLE_SEARCH;
+    sf->prune_comp_search_by_single_result = 2;
   }
 
   if (speed >= 4) {
@@ -250,10 +277,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->adaptive_pred_interp_filter = 0;
     sf->adaptive_mode_search = 1;
     sf->cb_partition_search = !boosted;
-    sf->cb_pred_filter_search = 1;
     sf->alt_ref_search_fp = 1;
-    sf->mode_skip_start = 6;
-    sf->adaptive_interp_filter_search = 1;
   }
 
   if (speed >= 5) {
@@ -276,7 +300,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
                   FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
                   FLAG_EARLY_TERMINATE;
     sf->disable_filter_search_var_thresh = 200;
-    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->use_fast_coef_costing = 1;
     sf->partition_search_breakout_rate_thr = 300;
     sf->use_transform_domain_distortion = 2;
@@ -296,33 +319,17 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->simple_model_rd_from_var = 1;
   }
   if (speed >= 7) {
-    const int is_keyframe = cm->frame_type == KEY_FRAME;
-    const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
     sf->default_max_partition_size = BLOCK_32X32;
     sf->default_min_partition_size = BLOCK_8X8;
     sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->mv.search_method = FAST_HEX;
-    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
-    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
     sf->partition_search_type = REFERENCE_PARTITION;
-    sf->reuse_inter_pred_sby = 1;
-    sf->force_frame_boost =
-        is_keyframe ||
-        (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
-    sf->max_delta_qindex = is_keyframe ? 20 : 15;
-    sf->coeff_prob_appx_step = 4;
     sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
   }
   if (speed >= 8) {
     sf->mv.search_method = FAST_DIAMOND;
-    sf->mv.fullpel_search_step_param = 10;
     sf->mv.subpel_force_stop = 2;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
   }
@@ -356,54 +363,6 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 }
 
-static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) {
-  AV1_COMMON *const cm = &cpi->common;
-
-  if (speed & TXFM_CODING_SF) {
-    sf->inter_tx_size_search_init_depth_rect = 1;
-    sf->inter_tx_size_search_init_depth_sqr = 1;
-    sf->intra_tx_size_search_init_depth_rect = 1;
-    sf->intra_tx_size_search_init_depth_sqr = 1;
-    sf->tx_size_search_method = USE_FAST_RD;
-    sf->tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_type_search.fast_inter_tx_type_search = 1;
-  }
-
-  if (speed & INTER_PRED_SF) {
-    sf->selective_ref_frame = 2;
-    // sf->adaptive_motion_search = 1;
-    sf->mv.auto_mv_step_size = 1;
-    sf->adaptive_rd_thresh = 1;
-    sf->mv.subpel_iters_per_step = 1;
-    sf->adaptive_pred_interp_filter = 1;
-  }
-
-  if (speed & INTRA_PRED_SF) {
-    sf->max_intra_bsize = BLOCK_32X32;
-  }
-
-  if (speed & PARTITION_SF) {
-    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
-        has_internal_image_edge(cpi)) {
-      sf->use_square_partition_only_threshold =
-          frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4;
-    } else {
-      sf->use_square_partition_only_threshold =
-          frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4;
-    }
-    sf->less_rectangular_check = 1;
-    sf->prune_ext_partition_types_search_level = 2;
-  }
-
-  if (speed & LOOP_FILTER_SF) {
-    sf->fast_cdef_search = 1;
-  }
-
-  if (speed & RD_SKIP_SF) {
-    sf->use_rd_breakout = 1;
-  }
-}
-
 void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -432,9 +391,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
 #endif  // DISABLE_TRELLISQ_SEARCH
   sf->gm_erroradv_type = GM_ERRORADV_TR_0;
   sf->mv.reduce_first_step_size = 0;
-  sf->coeff_prob_appx_step = 1;
   sf->mv.auto_mv_step_size = 0;
-  sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
   sf->tx_size_search_method = USE_FULL_RD;
@@ -450,7 +407,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
   sf->adaptive_mode_search = 0;
-  sf->cb_pred_filter_search = 0;
   sf->cb_partition_search = 0;
   sf->alt_ref_search_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
@@ -461,22 +417,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->tx_type_search.fast_inter_tx_type_search = 0;
   sf->tx_type_search.skip_tx_search = 0;
   sf->selective_ref_frame = 0;
-  sf->less_rectangular_check = 0;
+  sf->less_rectangular_check_level = 0;
   sf->use_square_partition_only_threshold = BLOCK_128X128;
+  sf->prune_ref_frame_for_rect_partitions = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->rd_auto_partition_min_limit = BLOCK_4X4;
   sf->default_max_partition_size = BLOCK_LARGEST;
   sf->default_min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
-  sf->last_partitioning_redo_frequency = 4;
   sf->disable_split_mask = 0;
   sf->mode_search_skip_flags = 0;
-  sf->force_frame_boost = 0;
-  sf->max_delta_qindex = 0;
   sf->disable_filter_search_var_thresh = 0;
-  sf->adaptive_interp_filter_search = 0;
   sf->allow_partition_search_skip = 0;
-  sf->use_accurate_subpel_search = 1;
+  sf->use_accurate_subpel_search = 2;
   sf->disable_wedge_search_var_thresh = 0;
   sf->fast_wedge_sign_estimate = 0;
   sf->drop_ref = 0;
@@ -491,48 +444,46 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->optimize_b_precheck = 0;
   sf->jnt_comp_fast_tx_search = 0;
   sf->jnt_comp_skip_mv_search = 0;
+  sf->reuse_inter_intra_mode = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
     sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
   }
-  sf->use_rd_breakout = 0;
   sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
-  sf->use_fast_coef_updates = TWO_LOOP;
   sf->use_fast_coef_costing = 0;
-  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
-  sf->schedule_mode_search = 0;
-  for (i = 0; i < BLOCK_SIZES_ALL; ++i) sf->inter_mode_mask[i] = INTER_ALL;
   sf->max_intra_bsize = BLOCK_LARGEST;
-  sf->reuse_inter_pred_sby = 0;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
   sf->always_this_block_size = BLOCK_16X16;
-  sf->search_type_check_frequency = 50;
   // Recode loop tolerance %.
   sf->recode_tolerance = 25;
-  sf->default_interp_filter = SWITCHABLE;
   sf->partition_search_breakout_dist_thr = 0;
   sf->partition_search_breakout_rate_thr = 0;
   sf->simple_model_rd_from_var = 0;
   sf->prune_ext_partition_types_search_level = 0;
+  sf->ml_prune_rect_partition = 0;
   sf->ml_prune_ab_partition = 0;
   sf->ml_prune_4_partition = 0;
   sf->fast_cdef_search = 0;
+  for (i = 0; i < PARTITION_BLOCK_SIZES; ++i)
+    sf->ml_partition_search_breakout_thresh[i] = -1;  // -1 means not enabled.
 
   // Set this at the appropriate speed levels
   sf->use_transform_domain_distortion = 0;
   sf->gm_search_type = GM_FULL_SEARCH;
+  sf->gm_disable_recode = 0;
   sf->use_fast_interpolation_filter_search = 0;
   sf->skip_repeat_interpolation_filter_search = 0;
   sf->use_hash_based_trellis = 0;
+  sf->prune_comp_search_by_single_result = 0;
+  sf->skip_repeated_newmv = 0;
 
   // Set decoder side speed feature to use less dual sgr modes
   sf->dual_sgr_penalty_level = 0;
 
   sf->inter_mode_rd_model_estimation = 0;
-
-  set_dev_sf(cpi, sf, oxcf->dev_sf);
+  sf->obmc_full_pixel_search_level = 0;
 
   if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
@@ -599,10 +550,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   x->min_partition_size = sf->default_min_partition_size;
   x->max_partition_size = sf->default_max_partition_size;
 
-  if (!cpi->oxcf.frame_periodic_boost) {
-    sf->max_delta_qindex = 0;
-  }
-
   // This is only used in motion vector unit test.
   if (cpi->oxcf.motion_vector_unit_test == 1)
     cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
@@ -611,5 +558,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
 
 #if CONFIG_DIST_8X8
   if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+
+  if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
 #endif  // CONFIG_DIST_8X8
 }
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index d0408ba2f..41013b2e7 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_SPEED_FEATURES_H_
-#define AV1_ENCODER_SPEED_FEATURES_H_
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
 
 #include "av1/common/enums.h"
 
@@ -54,25 +54,6 @@ enum {
               (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
               (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
               (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
-  INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
-                  (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
-  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) |
-                      (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
-                      (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
-                      (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
-  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) |
-                       (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
-                       (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
-  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) |
-                           (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
-                           (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) |
-                           (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) |
-                           (1 << NEAR_NEWMV),
-  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) |
-                           (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
-                           (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
-                           (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
-                           (1 << NEAR_NEARMV),
   INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
                             (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
                             (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
@@ -132,11 +113,6 @@ typedef enum {
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
-typedef enum {
-  NO_MOTION_THRESHOLD = 0,
-  LOW_MOTION_THRESHOLD = 7
-} MOTION_THRESHOLD;
-
 typedef enum {
   USE_FULL_RD = 0,
   USE_FAST_RD,
@@ -178,12 +154,6 @@ typedef enum {
   FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
 } MODE_SEARCH_SKIP_LOGIC;
 
-typedef enum {
-  FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
-  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
-  FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
-} INTERP_FILTER_MASK;
-
 typedef enum {
   NO_PRUNE = 0,
   // eliminates one tx type in vertical and horizontal direction
@@ -224,16 +194,6 @@ typedef enum {
   REFERENCE_PARTITION
 } PARTITION_SEARCH_TYPE;
 
-typedef enum {
-  // Does a dry run to see if any of the contexts need to be updated or not,
-  // before the final run.
-  TWO_LOOP = 0,
-
-  // No dry run, also only half the coef contexts and bands are updated.
-  // The rest are not updated at all.
-  ONE_LOOP_REDUCED = 1
-} FAST_COEFF_UPDATE;
-
 typedef struct MV_SPEED_FEATURES {
   // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
@@ -257,9 +217,6 @@ typedef struct MV_SPEED_FEATURES {
 
   // Control when to stop subpel search
   int subpel_force_stop;
-
-  // This variable sets the step_param used in full pel motion search.
-  int fullpel_search_step_param;
 } MV_SPEED_FEATURES;
 
 #define MAX_MESH_STEP 4
@@ -332,13 +289,6 @@ typedef struct SPEED_FEATURES {
   // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
 
-  // Coefficient probability model approximation step size
-  int coeff_prob_appx_step;
-
-  // The threshold is to determine how slow the motino is, it is used when
-  // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
-  MOTION_THRESHOLD lf_motion_threshold;
-
   // Determine which method we use to determine transform size. We can choose
   // between options like full rd, largest for prediction size, largest
   // for intra and model coefs for the rest.
@@ -355,11 +305,6 @@ typedef struct SPEED_FEATURES {
   // largest transform only, since the largest transform block size is 64x64.
   int tx_size_search_lgr_block;
 
-  // After looking at the first set of modes (set by index here), skip
-  // checking modes for reference frames that don't match the reference frame
-  // of the best so far.
-  int mode_skip_start;
-
   PARTITION_SEARCH_TYPE partition_search_type;
 
   TX_TYPE_SEARCH tx_type_search;
@@ -397,6 +342,9 @@ typedef struct SPEED_FEATURES {
   // aggressiveness of pruning in order.
   int prune_ext_partition_types_search_level;
 
+  // Use a ML model to prune horz and vert partitions
+  int ml_prune_rect_partition;
+
   // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
   int ml_prune_ab_partition;
 
@@ -413,12 +361,16 @@ typedef struct SPEED_FEATURES {
   int mode_pruning_based_on_two_pass_partition_search;
 
   // Skip rectangular partition test when partition type none gives better
-  // rd than partition type split.
-  int less_rectangular_check;
+  // rd than partition type split. Can take values 0 - 2, 0 referring to no
+  // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+  int less_rectangular_check_level;
 
   // Use square partition only beyond this block size.
   BLOCK_SIZE use_square_partition_only_threshold;
 
+  // Prune reference frames for rectangular partitions.
+  int prune_ref_frame_for_rect_partitions;
+
   // Sets min and max partition sizes for this superblock based on the
   // same superblock in last encoded frame, and the left and above neighbor.
   AUTO_MIN_MAX_MODE auto_min_max_partition_size;
@@ -435,10 +387,6 @@ typedef struct SPEED_FEATURES {
   // frame's partitioning. Only used if use_lastframe_partitioning is set.
   int adjust_partitioning_from_last_frame;
 
-  // How frequently we re do the partitioning from scratch. Only used if
-  // use_lastframe_partitioning is set.
-  int last_partitioning_redo_frequency;
-
   // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
   // it always, to allow it for only Last frame and Intra, disable it for all
   // inter modes or to enable it always.
@@ -461,8 +409,6 @@ typedef struct SPEED_FEATURES {
   // Pattern to be used for any exhaustive mesh searches.
   MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
 
-  int schedule_mode_search;
-
   // Allows sub 8x8 modes to use the prediction filter that was determined
   // best for 8x8 mode. If set to 0 we always re check all the filters for
   // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
@@ -472,20 +418,10 @@ typedef struct SPEED_FEATURES {
   // Adaptive prediction mode search
   int adaptive_mode_search;
 
-  // Chessboard pattern prediction filter type search
-  int cb_pred_filter_search;
-
   int cb_partition_search;
 
   int alt_ref_search_fp;
 
-  // Use finer quantizer in every other few frames that run variable block
-  // partition type search.
-  int force_frame_boost;
-
-  // Maximally allowed base quantization index fluctuation.
-  int max_delta_qindex;
-
   // Implements various heuristics to skip searching modes
   // The heuristics selected are based on  flags
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
@@ -506,22 +442,9 @@ typedef struct SPEED_FEATURES {
   int intra_y_mode_mask[TX_SIZES];
   int intra_uv_mode_mask[TX_SIZES];
 
-  // This variable enables an early break out of mode testing if the model for
-  // rd built from the prediction signal indicates a value that's much
-  // higher than the best rd we've seen so far.
-  int use_rd_breakout;
-
   // This feature controls how the loop filter level is determined.
   LPF_PICK_METHOD lpf_pick;
 
-  // This feature limits the number of coefficients updates we actually do
-  // by only looking at counts from 1/2 the bands.
-  FAST_COEFF_UPDATE use_fast_coef_updates;
-
-  // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV
-  // modes are used in order from LSB to MSB for each BLOCK_SIZE.
-  int inter_mode_mask[BLOCK_SIZES_ALL];
-
   // This feature controls whether we do the expensive context update and
   // calculation in the rd coefficient costing loop.
   int use_fast_coef_costing;
@@ -535,28 +458,13 @@ typedef struct SPEED_FEATURES {
   // TODO(aconverse): Fold this into one of the other many mode skips
   BLOCK_SIZE max_intra_bsize;
 
-  // The frequency that we check if
-  // FIXED_PARTITION search type should be used.
-  int search_type_check_frequency;
-
-  // When partition is pre-set, the inter prediction result from pick_inter_mode
-  // can be reused in final block encoding process. It is enabled only for real-
-  // time mode speed 6.
-  int reuse_inter_pred_sby;
-
-  // default interp filter choice
-  InterpFilter default_interp_filter;
-
-  // adaptive interp_filter search to allow skip of certain filter types.
-  int adaptive_interp_filter_search;
-
-  // mask for skip evaluation of certain interp_filter type.
-  INTERP_FILTER_MASK interp_filter_search_mask;
-
   // Partition search early breakout thresholds.
   int64_t partition_search_breakout_dist_thr;
   int partition_search_breakout_rate_thr;
 
+  // Thresholds for ML based partition search breakout.
+  int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
   // Allow skipping partition search for still image frame
   int allow_partition_search_skip;
 
@@ -577,6 +485,9 @@ typedef struct SPEED_FEATURES {
 
   GM_SEARCH_TYPE gm_search_type;
 
+  // whether to disable the global motion recode loop
+  int gm_disable_recode;
+
   // Do limited interpolation filter search for dual filters, since best choice
   // usually includes EIGHTTAP_REGULAR.
   int use_fast_interpolation_filter_search;
@@ -624,6 +535,25 @@ typedef struct SPEED_FEATURES {
 
   // Dynamically estimate final rd from prediction error and mode cost
   int inter_mode_rd_model_estimation;
+
+  // Skip some ref frames in compound motion search by single motion search
+  // result. Takes values 0 - 3 for now: 0 referring to no skipping, and 1 - 3
+  // increasing aggressiveness of skipping in order.
+  // Note: The search order might affect the result. It is better to search same
+  // single inter mode as a group.
+  int prune_comp_search_by_single_result;
+
+  // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+  // single ref modes
+  int reuse_inter_intra_mode;
+
+  // Set the full pixel search level of obmc
+  // 0: obmc_full_pixel_diamond
+  // 1: obmc_refining_search_sad (faster)
+  int obmc_full_pixel_search_level;
+
+  // flag to skip NEWMV mode in drl if the motion search result is the same
+  int skip_repeated_newmv;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
@@ -635,4 +565,4 @@ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_SPEED_FEATURES_H_
+#endif  // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index d7e4f4eb3..75fdf02a5 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -25,6 +25,7 @@
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/temporal_filter.h"
 #include "aom_dsp/aom_dsp_common.h"
@@ -37,13 +38,12 @@ static void temporal_filter_predictors_mb_c(
     MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
     int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
     uint8_t *pred, struct scale_factors *scale, int x, int y,
-    int can_use_previous) {
-  const int which_mv = 0;
+    int can_use_previous, int num_planes) {
   const MV mv = { mv_row, mv_col };
   enum mv_precision mv_precision_uv;
   int uv_stride;
   // TODO(angiebird): change plane setting accordingly
-  ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd);
+  ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
   const InterpFilters interp_filters = xd->mi[0]->interp_filters;
   WarpTypesAllowed warp_types;
   memset(&warp_types, 0, sizeof(WarpTypesAllowed));
@@ -55,37 +55,21 @@ static void temporal_filter_predictors_mb_c(
     uv_stride = stride;
     mv_precision_uv = MV_PRECISION_Q3;
   }
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
-                                     16, 16, which_mv, interp_filters,
-                                     &warp_types, x, y, 0, MV_PRECISION_Q3, x,
-                                     y, xd, can_use_previous);
-
-    av1_highbd_build_inter_predictor(
-        u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
-        uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
-        x, y, 1, mv_precision_uv, x, y, xd, can_use_previous);
-
-    av1_highbd_build_inter_predictor(
-        v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
-        uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
-        x, y, 2, mv_precision_uv, x, y, xd, can_use_previous);
-    return;
-  }
   av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
                             &conv_params, interp_filters, &warp_types, x, y, 0,
                             0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
 
-  av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
-                            &mv, scale, uv_block_width, uv_block_height,
-                            &conv_params, interp_filters, &warp_types, x, y, 1,
-                            0, mv_precision_uv, x, y, xd, can_use_previous);
+  if (num_planes > 1) {
+    av1_build_inter_predictor(
+        u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
+        uv_block_width, uv_block_height, &conv_params, interp_filters,
+        &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
 
-  av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
-                            &mv, scale, uv_block_width, uv_block_height,
-                            &conv_params, interp_filters, &warp_types, x, y, 2,
-                            0, mv_precision_uv, x, y, xd, can_use_previous);
+    av1_build_inter_predictor(
+        v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
+        uv_block_width, uv_block_height, &conv_params, interp_filters,
+        &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+  }
 }
 
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
@@ -214,7 +198,8 @@ void av1_highbd_temporal_filter_apply_c(
 static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
-                                              int stride) {
+                                              int stride, int x_pos,
+                                              int y_pos) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -250,11 +235,9 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
   x->mvcost = x->mv_cost_stack;
   x->nmvjointcost = x->nmv_vec_cost;
 
-  // Use mv costing from x->mvcost directly
-  av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                 cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
-                 &best_ref_mv1);
-
+  av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+                        NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
   x->mv_limits = tmp_mv_limits;
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -370,7 +353,8 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
           // Find best match in this frame by MC
           int err = temporal_filter_find_matching_mb_c(
               cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
-              frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
+              frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+              mb_col * 16, mb_row * 16);
 
           // Assign higher weight to matching MB if it's error
           // score is lower. If not applying MC default behavior
@@ -386,7 +370,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
               frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
               mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
               mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16,
-              mb_row * 16, cm->allow_warped_motion);
+              mb_row * 16, cm->allow_warped_motion, num_planes);
 
           // Apply the filter (YUV)
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -556,14 +540,6 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
     strength = group_boost / 300;
   }
 
-  // Adjustments for second level arf in multi arf case.
-  if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
-      strength >>= 1;
-    }
-  }
-
   *arnr_frames = frames;
   *arnr_strength = strength;
 }
@@ -593,21 +569,6 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
 
   int which_arf = gf_group->arf_update_idx[gf_group->index];
 
-#if USE_GF16_MULTI_LAYER
-  if (cpi->rc.baseline_gf_interval == 16) {
-    // Identify the index to the current ARF.
-    const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
-    int arf_idx;
-    for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
-      if (gf_group->index == cpi->arf_pos_in_gf[arf_idx]) {
-        which_arf = arf_idx;
-        break;
-      }
-    }
-    assert(arf_idx < num_arfs_in_gf);
-  }
-#endif  // USE_GF16_MULTI_LAYER
-
   // Set the temporal filtering status for the corresponding OVERLAY frame
   if (strength == 0 && frames_to_blur == 1)
     cpi->is_arf_filter_off[which_arf] = 1;
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
index bc0863a63..2ddc68b2c 100644
--- a/third_party/aom/av1/encoder/temporal_filter.h
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_
-#define AV1_ENCODER_TEMPORAL_FILTER_H_
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -22,4 +22,4 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_TEMPORAL_FILTER_H_
+#endif  // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
index de1cbe99c..63b505f36 100644
--- a/third_party/aom/av1/encoder/tokenize.h
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_TOKENIZE_H_
-#define AV1_ENCODER_TOKENIZE_H_
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
 
 #include "av1/common/entropy.h"
 #include "av1/encoder/block.h"
@@ -70,4 +70,4 @@ static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_TOKENIZE_H_
+#endif  // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
index 69063b801..405bc9e6e 100644
--- a/third_party/aom/av1/encoder/tx_prune_model_weights.h
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
-#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -19,79 +19,114 @@ extern "C" {
 #include "av1/encoder/ml.h"
 
 // Tx type model for 4x4 block.
-static const float av1_tx_type_nn_weights_4x4_layer0[32] = {
-  0.72406f,  -0.40019f, 0.51795f,  -0.43881f, -0.49746f, -0.41780f, -0.39409f,
-  -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f,  1.06229f,  0.04188f,
-  0.60919f,  0.92405f,  -0.39359f, 0.70570f,  0.75375f,  1.11966f,  -1.86360f,
-  -0.35421f, 0.18743f,  0.13346f,  -0.21262f, 0.07050f,  0.10533f,  -0.47402f,
-  1.33417f,  1.72899f,  1.17983f,  0.10552f,
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+  -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+  0.17996f,  1.20000f,  -0.27654f, 0.77396f,  1.21684f,  -1.75909f, -0.51272f,
+  -1.25923f, 0.35005f,  -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+  2.73144f,  -0.16875f, -0.23482f, 0.02194f,  -0.26427f, 0.28049f,  0.21260f,
+  1.35792f,  0.27733f,  0.88660f,  -0.68304f,
 };
 
-static const float av1_tx_type_nn_bias_4x4_layer0[8] = {
-  1.96273f, -0.69845f, -0.10999f, -1.11311f,
-  1.35101f, 0.43842f,  -0.29264f, -1.15376f,
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+  1.38742f, 0.59540f,  -1.37622f, 1.92114f,
+  0.00000f, -0.38998f, -0.32726f, -0.15650f,
 };
 
-static const float av1_tx_type_nn_weights_4x4_layer1[32] = {
-  0.79770f,  0.08520f,  0.23298f,  0.05285f,  0.87506f,  -0.90784f, -0.06197f,
-  -1.00580f, 0.68639f,  -0.34881f, 0.15366f,  -1.64658f, 0.80755f,  -0.26293f,
-  0.10253f,  -0.23915f, 1.14696f,  -0.10928f, -1.61377f, 0.00863f,  0.98599f,
-  -0.43872f, 0.61196f,  -0.03787f, 1.01060f,  0.17643f,  -0.00208f, -0.15738f,
-  0.06517f,  0.72885f,  0.24387f,  1.28535f,
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+  1.65254f,  1.00915f,  -0.89318f, -2.05142f, -0.23235f, 0.96781f,  -0.37145f,
+  -0.21056f, 1.13891f,  0.38675f,  0.87739f,  -1.42697f, 0.48015f,  0.61883f,
+  -0.03979f, 0.11487f,  0.48042f,  0.45200f,  -0.23242f, 0.75166f,  0.55458f,
+  0.39452f,  -0.35285f, 1.59120f,  -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+  -0.26782f, -0.65416f, -0.10648f, 0.05568f,
 };
 
-static const float av1_tx_type_nn_bias_4x4_layer1[4] = {
-  1.23769f,
-  1.40308f,
-  0.09871f,
-  1.82070f,
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+  4.07177f,
+  3.26961f,
+  0.58083f,
+  1.21199f,
 };
 
-static const NN_CONFIG av1_tx_type_nnconfig_4x4 = {
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
   4,  // num_inputs
   4,  // num_outputs
   1,  // num_hidden_layers
   {
       8,
   },  // num_hidden_nodes
+  { av1_tx_type_nn_weights_4x4_hor_layer0,
+    av1_tx_type_nn_weights_4x4_hor_layer1 },
+  { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+  -0.02032f, 2.61610f,  0.02098f,  -0.30217f, 0.12637f,  0.11017f,  -3.01996f,
+  0.35144f,  1.93776f,  -0.20463f, 1.64102f,  -1.41986f, -3.66717f, -0.51655f,
+  0.43910f,  0.37778f,  -1.02634f, 0.85337f,  -0.69753f, 1.00206f,  2.11784f,
+  1.89427f,  1.92919f,  0.43201f,  -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+  -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+  -0.33685f, 0.22025f,  0.28140f, 0.56138f,
+  0.93489f,  -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+  -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f,  -2.39850f, -1.26457f,
+  0.75328f,  -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f,  -2.37739f,
+  -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+  -0.99165f, -1.91366f, 0.16785f,  0.34776f,  0.58154f,  -0.18217f, -0.29257f,
+  -0.86315f, -0.53336f, 0.30320f,  -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+  -1.31519f,
+  -3.26321f,
+  1.71794f,
+  -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+  4,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
   {
-      av1_tx_type_nn_weights_4x4_layer0,
-      av1_tx_type_nn_weights_4x4_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_4x4_layer0,
-      av1_tx_type_nn_bias_4x4_layer1,
-  },
+      8,
+  },  // num_hidden_nodes
+  { av1_tx_type_nn_weights_4x4_ver_layer0,
+    av1_tx_type_nn_weights_4x4_ver_layer1 },
+  { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
 };
 /******************************************************************************/
 
 // Tx type model for 4x8 block.
 static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
-  0.68355f,  -0.06887f, 0.68525f,  -0.86048f, -0.35906f, -0.28597f, -0.21108f,
-  0.12591f,  -1.13025f, -0.65695f, -0.25658f, 0.39155f,  0.89011f,  0.19258f,
-  0.28316f,  0.61172f,  0.52587f,  0.99182f,  0.75704f,  0.66788f,  -1.61814f,
-  -1.23483f, -0.62868f, -0.11902f, 0.33295f,  0.64796f,  0.92345f,  -0.71821f,
-  0.07575f,  0.34687f,  0.20518f,  -0.87850f,
+  0.00218f,  -0.41880f, -0.61215f, -0.92588f, 0.54291f,  -0.10898f, 0.70691f,
+  0.46819f,  -1.61598f, -0.08834f, -0.96839f, 1.18489f,  -0.45171f, -0.65445f,
+  -0.32179f, -0.10399f, 1.04379f,  0.91895f,  0.85589f,  0.08267f,  1.35388f,
+  -2.03096f, 0.08168f,  -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+  -1.35896f, -1.17121f, 1.68866f,  0.10357f,
 };
 
 static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
-  1.14049f, -0.18583f, 1.92114f, -0.72057f,
-  1.32715f, 0.96713f,  1.09877f, -0.64345f,
+  2.93391f,  0.66831f, -0.21419f, 0.00000f,
+  -0.72878f, 0.15127f, -1.46755f, 0.16658f,
 };
 
 static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
-  0.71978f,  0.06896f,  1.48617f,  0.97124f,  -0.02487f, -0.95359f, 0.68983f,
-  -0.16313f, 0.51324f,  -0.33770f, 0.45938f,  -1.08238f, 0.72938f,  0.42300f,
-  0.85691f,  -0.03783f, 1.12617f,  -0.04034f, 0.36923f,  0.25638f,  1.10167f,
-  0.41633f,  0.72602f,  -0.14797f, 0.66888f,  0.11437f,  -0.99797f, -0.20725f,
-  1.01163f,  2.06308f,  1.23331f,  -0.15481f,
+  -1.52077f, -1.06243f, 0.35319f,  -0.49207f, 0.54524f,  0.44271f, 1.37117f,
+  -0.38957f, -1.28889f, -0.57133f, 0.04658f,  0.62278f,  0.37984f, 0.33247f,
+  1.65547f,  -0.56806f, -1.38645f, -0.76258f, 0.67926f,  0.08783f, -0.01443f,
+  0.34950f,  1.45812f,  -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+  -0.50191f, 0.18219f,  1.83664f,  -0.75276f,
 };
 
 static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
-  2.14443f,
-  1.98356f,
-  0.74616f,
-  2.58795f,
+  -1.17455f,
+  -2.26089f,
+  -1.79863f,
+  -2.26333f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
@@ -101,62 +136,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
   {
       8,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_4x8_hor_layer0,
-      av1_tx_type_nn_weights_4x8_hor_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_4x8_hor_layer0,
-      av1_tx_type_nn_bias_4x8_hor_layer1,
-  },
+  { av1_tx_type_nn_weights_4x8_hor_layer0,
+    av1_tx_type_nn_weights_4x8_hor_layer1 },
+  { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
 };
 
 static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
-  0.88859f,  1.02796f,  1.15509f,  0.61719f,  0.85804f,  1.17581f,  0.93524f,
-  0.06546f,  0.08018f,  -0.78562f, -0.36614f, 0.14149f,  -0.30069f, -0.52647f,
-  -0.82789f, 0.60527f,  -1.74026f, -0.20271f, 0.09875f,  0.03708f,  0.09430f,
-  -0.24043f, -0.38433f, 1.21014f,  1.42443f,  0.69586f,  1.07812f,  1.21748f,
-  1.10989f,  0.93122f,  1.04127f,  0.39424f,  0.95592f,  0.12904f,  0.46330f,
-  0.49722f,  0.46303f,  0.36979f,  0.60227f,  0.39345f,  -2.01632f, -0.05706f,
-  0.07766f,  -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f,  0.27662f,
-  0.42028f,  0.44748f,  1.14585f,  1.38805f,  0.46182f,  -0.22982f, -0.07324f,
-  0.29886f,  -0.46959f, -0.04228f, -0.01064f, 0.24260f,  -0.32282f, -0.23804f,
-  1.44466f,  -0.42190f, -0.36385f, 0.39746f,  0.38557f,  -0.09624f, -0.21540f,
-  0.57385f,  -0.72878f, -0.39677f, -0.00717f, 0.60499f,  1.33849f,  1.05337f,
-  1.11947f,  0.38487f,  0.86534f,  -0.33970f, 0.71140f,  0.20772f,  0.61132f,
-  0.06181f,  -0.20027f, 0.13736f,  -0.72321f, 0.64586f,  -0.56740f, -0.90912f,
-  -0.20452f, 0.15381f,  -0.84346f, 0.19550f,  0.63164f,  1.35441f,  0.63218f,
-  0.82883f,  0.38803f,  -0.23874f, -0.02962f, 0.23846f,  -0.06822f, -0.40159f,
-  -0.17850f, -0.69524f, 1.12299f,  -0.08286f, -0.14150f, -0.28456f, -0.41519f,
-  -0.12792f, -0.55286f, 0.51655f,  0.06636f,  0.73759f,  0.70072f,  0.12616f,
-  0.31282f,  0.17130f,  -1.34233f, 0.37221f,  0.95838f,  0.16286f,  1.04301f,
-  0.73600f,  -0.11233f,
+  -0.00952f, -0.98858f, -0.93181f, 1.39594f,  0.96559f,  0.18162f,  -0.76064f,
+  -0.06066f, 0.07907f,  -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+  -0.10982f, 0.18559f,  1.17049f,  1.11387f,  1.12697f,  1.05804f,  1.12764f,
+  1.06318f,  1.12052f,  0.17406f,  1.83157f,  0.19362f,  0.46910f,  0.39608f,
+  0.33342f,  0.40083f,  0.27645f,  1.06864f,  -4.06645f, -0.38775f, -0.11070f,
+  0.03781f,  -0.09141f, 0.06185f,  -0.04852f, 0.20163f,  0.16784f,  0.16641f,
+  -0.50941f, -0.61087f, 2.07008f,  -0.82381f, -0.85558f, 0.05528f,  -0.10535f,
+  -2.81150f, 0.67038f,  0.43643f,  0.49062f,  -0.04465f, 0.90438f,  0.00977f,
+  0.46272f,  1.59751f,  0.95234f,  0.35086f,  0.85624f,  0.73149f,  1.67779f,
+  -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+  -1.24956f, 0.73797f,  1.23275f,  -0.60064f, -0.07851f, 0.14397f,  0.22110f,
+  -0.04422f, 0.14350f,  0.75926f,  0.35032f,  0.48104f,  2.81408f,  0.34662f,
+  0.42090f,  0.35521f,  -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+  0.32299f,  0.23916f,  0.06032f,  -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+  -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f,  -0.04973f,
+  -0.09273f, 1.04249f,  0.79235f,  1.13229f,  0.99617f,  0.03851f,  0.56334f,
+  0.90795f,  1.08296f,  0.58519f,  1.74765f,  0.63971f,  1.35951f,  0.07803f,
+  -0.05127f, 0.26514f,  -0.84629f, -0.66343f, -2.10630f, 0.11017f,  2.18528f,
+  -0.21958f, 0.05970f,
 };
 
 static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
-  -0.89131f, 0.09124f,  -0.71678f, -1.19929f, 0.98963f,  0.16896f,
-  -0.44943f, -0.97532f, -0.13997f, 1.07136f,  -0.46362f, -0.45253f,
-  -0.63015f, -0.20008f, 1.24048f,  -0.21265f,
+  0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f,  0.01143f,
+  0.00235f, 4.26772f, 0.44364f,  -0.33199f, -0.39076f, -0.35129f,
+  0.08288f, 0.18195f, -0.79890f, 0.10047f,
 };
 
 static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
-  -0.79795f, 0.45973f,  -0.54188f, -1.05095f, 0.64404f,  -0.56470f, -0.57018f,
-  0.61644f,  0.50229f,  1.14006f,  0.13805f,  -0.42058f, -0.07468f, 0.66203f,
-  0.93180f,  -0.59662f, -0.25152f, 0.00336f,  1.09769f,  -1.11921f, 0.15151f,
-  0.58750f,  -0.42480f, -0.95908f, -0.10980f, 1.31715f,  0.06665f,  -0.52371f,
-  0.37228f,  -0.12364f, 0.54876f,  -0.32698f, 0.39863f,  -0.97669f, -1.06351f,
-  1.82755f,  1.02851f,  0.10322f,  -0.08322f, 0.08891f,  -0.05715f, 0.93503f,
-  0.02096f,  -0.39506f, -0.99330f, -0.09407f, 0.75108f,  -0.30104f, 1.78314f,
-  -0.01786f, -0.17392f, 0.00461f,  0.41394f,  0.92566f,  1.11251f,  -0.71380f,
-  -0.04907f, 0.12736f,  0.00208f,  0.94451f,  -0.31783f, -0.19655f, 0.64619f,
-  0.50359f,
+  -0.38193f, -0.12095f, 1.57802f,  0.34932f,  -0.47333f, -0.12304f, -0.01736f,
+  -2.52445f, 0.18983f,  -0.64707f, -0.60889f, -0.53750f, 0.91666f,  -0.62823f,
+  -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f,  1.48589f,  -1.03238f,
+  -0.33459f, -0.35108f, -2.42417f, 0.60229f,  0.06824f,  -0.75495f, 0.26902f,
+  0.65311f,  -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f,  -0.59589f,
+  0.49738f,  -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f,  -0.66490f,
+  -0.76312f, 0.28256f,  1.06311f,  -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+  -1.04403f, -0.46531f, 0.34084f,  -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+  -0.68736f, -0.37904f, -1.32371f, 0.47288f,  1.51904f,  0.78372f,  -1.01830f,
+  -1.01848f,
 };
 
 static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
-  0.39274f,
-  1.27276f,
-  0.30322f,
-  2.55238f,
+  -1.45955f,
+  -2.08949f,
+  -1.24813f,
+  -1.55368f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
@@ -166,64 +196,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_4x8_ver_layer0,
-      av1_tx_type_nn_weights_4x8_ver_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_4x8_ver_layer0,
-      av1_tx_type_nn_bias_4x8_ver_layer1,
-  },
+  { av1_tx_type_nn_weights_4x8_ver_layer0,
+    av1_tx_type_nn_weights_4x8_ver_layer1 },
+  { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
 };
 /******************************************************************************/
 
 // Tx type model for 8x4 block.
 static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
-  0.64828f,  0.61618f,  0.98975f,  -0.14562f, 0.26957f,  1.80872f,  0.58299f,
-  -0.06917f, 0.00937f,  -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f,
-  -0.85166f, 0.88799f,  -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f,
-  -0.49650f, -0.32171f, 1.37181f,  1.30432f,  0.71843f,  1.01916f,  1.01582f,
-  0.90999f,  0.86334f,  1.04603f,  0.40734f,  0.96187f,  0.53742f,  0.07510f,
-  0.44167f,  0.02049f,  -0.02874f, 0.97191f,  1.03647f,  -2.62751f, -0.01390f,
-  -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f,  0.77191f,
-  0.75416f,  0.69067f,  0.93561f,  1.35982f,  0.76193f,  0.57869f,  0.00251f,
-  -0.87244f, -0.26922f, -0.06682f, 0.07176f,  0.51142f,  0.58948f,  0.13914f,
-  0.71165f,  -0.40329f, -0.33201f, 0.35293f,  0.33437f,  -0.01812f, -0.24765f,
-  0.26810f,  -0.77088f, 1.35707f,  0.22243f,  0.78402f,  0.66191f,  0.79890f,
-  1.90669f,  0.73189f,  0.24222f,  -0.34682f, 0.66990f,  0.19554f,  0.58414f,
-  0.05060f,  -0.21271f, 0.11656f,  -0.74907f, 0.68837f,  -0.39147f, -1.78263f,
-  -0.69918f, -0.06838f, -0.26927f, 0.38502f,  0.08305f,  1.29848f,  0.67328f,
-  0.67269f,  0.65805f,  -0.47778f, -1.02617f, 0.16523f,  0.12223f,  -0.35294f,
-  -0.15866f, -0.56224f, 1.25895f,  -0.21422f, -0.33518f, -0.33519f, -0.37414f,
-  0.55122f,  0.14806f,  0.44312f,  -0.07865f, 0.75295f,  0.10766f,  0.59922f,
-  0.48837f,  -0.19099f, -2.07991f, 0.35755f,  0.87813f,  0.07559f,  1.00724f,
-  0.25223f,  -0.06761f,
+  -0.22492f, 0.13341f,  -4.03243f, -0.64015f, 0.02783f,  0.60466f,  -0.13335f,
+  0.16828f,  0.12336f,  0.52904f,  1.18455f,  -0.32425f, 0.13052f,  0.93810f,
+  -3.71165f, 0.02990f,  -4.63558f, 0.05666f,  0.03524f,  -0.07449f, -0.44006f,
+  -0.33215f, -0.33713f, 0.08097f,  0.60873f,  0.29582f,  0.21696f,  -0.78729f,
+  -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f,  1.58463f,  1.48536f,
+  1.54374f,  1.60069f,  1.46125f,  1.53932f,  0.05974f,  -1.82192f, 0.47043f,
+  0.38090f,  0.20833f,  -0.05637f, 0.05183f,  0.01323f,  -0.25662f, 0.78634f,
+  -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+  -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+  0.75079f,  2.32551f,  0.05878f,  0.80438f,  0.88584f,  0.69153f,  0.89060f,
+  0.73660f,  0.87259f,  -0.00745f, -1.30044f, -0.59430f, 2.07270f,  1.03307f,
+  -0.84697f, -1.19393f, 0.17549f,  -0.24978f, -3.67234f, 0.20781f,  -0.53946f,
+  -0.05068f, 0.88274f,  1.30371f,  0.10288f,  0.07585f,  0.12259f,  -0.30815f,
+  0.25437f,  -2.82096f, -2.69482f, 0.02370f,  0.12500f,  -0.21019f, -0.49220f,
+  0.03638f,  -0.29795f, 0.28645f,  -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+  0.32437f,  0.32528f,  -0.19437f, 0.30383f,  -0.31879f, 0.26359f,  -0.12164f,
+  -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+  -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+  -1.85523f, 0.92532f,
 };
 
 static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
-  -0.54227f, 0.08599f,  -0.77447f, -1.10920f, 0.89298f,  0.05454f,
-  -0.73681f, 0.21048f,  -0.41041f, 1.25690f,  -0.60918f, 0.14661f,
-  -0.65392f, -0.25881f, 1.67995f,  -0.03550f,
+  0.36631f,  0.02901f,  0.64305f,  1.53074f, -1.40229f, 0.03852f,
+  -0.05043f, 0.89632f,  -1.23312f, 0.07036f, 0.17070f,  0.56250f,
+  -0.28958f, -0.32869f, -0.01704f, 0.68171f,
 };
 
 static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
-  -0.22312f, 0.73552f,  0.48399f,  -0.66996f, 0.36527f,  -0.42228f, -1.10793f,
-  0.31167f,  0.16177f,  1.69315f,  -0.06287f, -0.35804f, -0.24889f, 0.80824f,
-  1.08952f,  -0.62838f, 0.30066f,  -0.19043f, -0.00518f, -1.31005f, 0.65797f,
-  1.07714f,  -0.24253f, 0.49779f,  0.05848f,  1.08914f,  0.08015f,  -0.38853f,
-  0.35108f,  -0.11026f, 0.64528f,  -0.37615f, 0.39995f,  -0.58117f, -1.29627f,
-  1.74169f,  0.75558f,  -0.04910f, 0.35020f,  0.04556f,  0.12634f,  1.27223f,
-  0.02608f,  -0.19687f, -0.78649f, -0.22746f, 1.02589f,  -0.28411f, 1.42443f,
-  -0.42115f, -0.21153f, -0.01733f, 0.62001f,  0.87167f,  1.66008f,  -0.39179f,
-  -0.06293f, 0.27012f,  0.16871f,  0.64597f,  0.67358f,  -0.20053f, 0.95830f,
-  0.44232f,
+  -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f,  -0.73172f,
+  -0.69337f, 0.88807f,  -0.49242f, -0.44717f, -0.11436f, 0.09978f,  0.15393f,
+  0.17083f,  1.44850f,  -0.20582f, -0.04906f, 0.42990f,  -0.61939f, -1.09692f,
+  -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+  0.15383f,  -0.04193f, -0.54858f, 1.82676f,  -0.22411f, 0.05264f,  -0.45848f,
+  -0.72985f, 0.87553f,  0.04116f,  -1.29774f, -2.63018f, 1.09089f,  -0.36048f,
+  -0.16725f, 0.11627f,  0.49918f,  0.07539f,  0.00763f,  0.73706f,  0.87800f,
+  0.57049f,  0.60969f,  1.02779f,  1.53339f,  -0.35915f, 0.06410f,  1.44582f,
+  0.09698f,  0.71888f,  0.60594f,  0.84103f,  -0.50440f, -0.38825f, 0.15626f,
+  -1.10654f,
 };
 
 static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
-  0.14889f,
-  1.74197f,
-  0.53696f,
-  2.87574f,
+  -0.92861f,
+  -1.45151f,
+  -1.33588f,
+  -4.33853f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
@@ -233,42 +258,37 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_8x4_hor_layer0,
-      av1_tx_type_nn_weights_8x4_hor_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_8x4_hor_layer0,
-      av1_tx_type_nn_bias_8x4_hor_layer1,
-  },
+  { av1_tx_type_nn_weights_8x4_hor_layer0,
+    av1_tx_type_nn_weights_8x4_hor_layer1 },
+  { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
 };
 
 static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
-  0.81919f,  0.15527f,  0.60055f,  -0.54617f, -0.35510f, -0.28223f, -0.20478f,
-  0.15001f,  -1.84806f, -0.30274f, -0.00865f, 0.33939f,  1.11970f,  0.44630f,
-  0.32074f,  0.39637f,  0.08149f,  1.28070f,  0.86703f,  0.76503f,  -1.83991f,
-  -1.13575f, -0.68605f, -0.23690f, 0.07099f,  0.64960f,  0.82543f,  -0.72028f,
-  0.08220f,  0.34338f,  0.20245f,  -0.88920f,
+  -1.10946f, 1.86574f,  -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+  -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+  -1.36831f, 1.00374f,  2.59312f,  0.50291f, -0.71042f, -0.12238f, -0.15901f,
+  -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f,  2.28687f,
+  1.66212f,  1.70826f,  1.55182f,  0.12230f,
 };
 
 static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
-  1.14995f, -0.16021f, 2.38325f, -0.65179f,
-  1.09624f, 1.07662f,  0.63837f, -0.64847f,
+  0.10943f,  2.09789f, 2.16578f, 0.15766f,
+  -0.42461f, 0.00000f, 1.22090f, -1.28717f,
 };
 
 static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
-  0.10278f,  0.06819f,  1.73885f,  1.29889f,  -0.18482f, -1.06132f, 0.67003f,
-  -0.23280f, 0.50181f,  -0.33890f, 0.43524f,  -1.03147f, 1.09640f,  0.66332f,
-  0.47652f,  -0.02251f, 0.94245f,  -0.03861f, 0.84776f,  0.28377f,  0.92044f,
-  0.23572f,  0.52082f,  -0.16266f, 0.45290f,  0.11342f,  -0.50310f, -0.92633f,
-  1.46345f,  1.84714f,  1.06804f,  -0.13610f,
+  1.20426f,  -1.23237f, 2.41053f, -0.72488f, 1.25249f,  0.18018f,  -0.09586f,
+  2.17901f,  0.15364f,  1.21535f, -0.38263f, -0.74309f, 0.50551f,  -0.54208f,
+  0.59139f,  1.16095f,  0.55919f, -0.60183f, 1.18949f,  1.60787f,  0.54002f,
+  -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f,  -2.83483f, -0.27086f,
+  -1.15005f, -0.39311f, 1.51236f, -1.68973f,
 };
 
 static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
-  2.41028f,
-  1.95675f,
-  0.82387f,
-  2.41923f,
+  1.81013f,
+  1.10517f,
+  2.90059f,
+  0.95391f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
@@ -278,131 +298,181 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
   {
       8,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_8x4_ver_layer0,
-      av1_tx_type_nn_weights_8x4_ver_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_8x4_ver_layer0,
-      av1_tx_type_nn_bias_8x4_ver_layer1,
-  },
+  { av1_tx_type_nn_weights_8x4_ver_layer0,
+    av1_tx_type_nn_weights_8x4_ver_layer1 },
+  { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
 };
 /******************************************************************************/
 
 // Tx type model for 8x8 block.
-static const float av1_tx_type_nn_weights_8x8_layer0[128] = {
-  0.98214f,  1.05643f,  0.91173f,  0.24165f,  0.39961f,  0.25736f,  0.68593f,
-  0.10553f,  0.13353f,  -0.49687f, -1.66413f, 1.16584f,  2.25147f,  -0.72247f,
-  -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f,
-  -1.06022f, -1.42138f, 1.10500f,  2.96552f,  -0.40638f, 0.02258f,  -0.23137f,
-  0.34922f,  -0.01454f, 0.41251f,  0.35944f,  -1.56742f, 0.01406f,  0.88114f,
-  1.42462f,  0.87243f,  0.02439f,  0.07035f,  0.34303f,  -3.16843f, 0.25798f,
-  0.07494f,  0.38926f,  -0.12267f, 0.09049f,  -0.36711f, 0.01551f,  1.41269f,
-  1.33505f,  1.43627f,  1.41909f,  1.44605f,  1.43008f,  1.36721f,  0.19443f,
-  -0.08606f, 0.17285f,  0.63692f,  0.92092f,  0.61007f,  0.87100f,  -0.33631f,
-  1.98025f,  -0.40686f, -0.33808f, 0.34919f,  0.33817f,  -0.01807f, -0.25259f,
-  0.26442f,  -0.76979f, 1.07788f,  -1.38747f, 1.34315f,  2.79947f,  2.02838f,
-  -0.25062f, 0.00174f,  1.25888f,  0.17344f,  0.20897f,  1.28765f,  1.95749f,
-  1.62351f,  1.04556f,  0.43858f,  0.12463f,  1.66399f,  0.03971f,  0.36614f,
-  0.56932f,  0.15982f,  0.11587f,  0.21402f,  1.89386f,  -0.91267f, -0.79781f,
-  1.79155f,  0.60147f,  -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f,
-  -0.11409f, -0.79470f, 0.69697f,  -0.16588f, -0.16090f, -0.21236f, -0.52776f,
-  -0.64455f, 0.09173f,  0.80766f,  0.76097f,  0.20295f,  -0.93467f, -0.43509f,
-  0.59659f,  0.07788f,  -3.79459f, 0.16268f,  0.47343f,  0.05106f,  -0.24880f,
-  1.18941f,  0.10346f,
-};
-
-static const float av1_tx_type_nn_bias_8x8_layer0[16] = {
-  0.75780f,  0.25628f,  0.19911f,  -0.41384f, 1.33909f,  0.31498f,
-  -1.37171f, -1.09561f, -0.44056f, 0.49001f,  -0.65804f, -1.96031f,
-  0.64806f,  -0.52520f, 1.38838f,  0.15519f,
-};
-
-static const float av1_tx_type_nn_weights_8x8_layer1[64] = {
-  -0.63856f, -2.02670f, -0.92947f, 0.00216f,  1.47710f,  -2.01099f, -2.11289f,
-  -0.92288f, 0.19296f,  1.37866f,  -0.85975f, -0.78624f, -2.10392f, 0.13976f,
-  1.06968f,  -2.04120f, 0.57991f,  -1.84941f, -0.81512f, -2.08254f, -0.47334f,
-  0.12256f,  -1.39594f, -1.02829f, 0.06134f,  2.25646f,  -1.25196f, -2.65317f,
-  -1.94473f, 0.10989f,  0.55446f,  -1.76557f, 0.33455f,  -1.85556f, -3.01878f,
-  -0.25100f, 1.65520f,  -1.61409f, 1.16336f,  -1.15560f, 0.13631f,  1.50733f,
-  -1.07538f, -0.91200f, -1.93132f, 0.09271f,  0.24425f,  -1.80655f, -0.01138f,
-  -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f,  -1.60316f,
-  -0.02495f, 1.04938f,  0.28411f,  -0.79809f, -1.48232f, 0.00766f,  0.94016f,
-  -1.11974f,
-};
-
-static const float av1_tx_type_nn_bias_8x8_layer1[4] = {
-  0.53574f,
-  1.57736f,
-  -0.13698f,
-  2.64613f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_8x8 = {
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+  -0.85529f, 0.37619f,  0.12754f,  0.08622f,  0.45278f,  0.54929f,  1.60651f,
+  -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f,  0.31695f,  -0.05616f,
+  0.20483f,  -0.36448f, 2.27203f,  -0.33087f, 0.47679f,  0.86888f,  0.39370f,
+  0.46239f,  0.01113f,  1.50327f,  -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+  -1.37753f, -1.22681f, -1.70576f, 0.51329f,  -1.65662f, 1.74197f,  -0.13579f,
+  -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f,  -0.56491f,
+  -0.83432f, 0.13492f,  1.32147f,  2.85285f,  0.13819f,  0.03792f,  -1.30792f,
+  0.04155f,  -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+  0.29540f,  0.01137f,  -0.25335f, -0.16856f, 0.12028f,  0.05207f,  0.39357f,
+  -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+  2.08283f,  0.19291f,  -4.81426f, -0.65044f, -0.24598f, 0.06371f,  -0.10272f,
+  -0.14502f, -0.06821f, 0.45202f,  0.21091f,  -0.80864f, 0.39255f,  1.79189f,
+  1.80453f,  1.10484f,  1.17608f,  0.96901f,  -0.35871f, -0.94311f, 0.63147f,
+  2.95157f,  0.45917f,  -0.42849f, -0.55643f, -0.06097f, 3.49299f,  -0.50972f,
+  0.11075f,  -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f,  -1.61074f,
+  1.82998f,  0.37623f,  -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+  1.38846f,  1.42085f,  1.42568f,  1.36152f,  1.46910f,  1.27473f,  1.34752f,
+  0.12753f,  -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+  -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+  -0.49232f, -0.29685f, -1.44020f, 1.10940f,  1.16452f, -0.34862f,
+  -0.38761f, -0.36243f, 0.21776f,  0.28234f,  2.34269f, -0.04104f,
+  -0.26319f, 2.65579f,  -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+  -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f,  -1.42322f,
+  -0.29106f, 0.07228f,  0.04391f,  1.61388f,  -0.03055f, 0.81637f,  2.06045f,
+  0.27119f,  -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+  0.60958f,  -1.30523f, 0.25143f,  0.11398f,  0.37860f,  1.54829f,  0.02309f,
+  0.67288f,  2.11447f,  0.44845f,  -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+  -1.22646f, -1.54571f, 0.60552f,  -1.52565f, 0.11469f,  0.17344f,  0.08622f,
+  1.57906f,  -0.00909f, 0.81634f,  2.04909f,  1.26466f,  -1.45741f, -0.75229f,
+  0.06200f,  -1.05835f, -0.66257f, -1.73766f, 0.99923f,  -1.87082f, 0.14580f,
+  0.49525f,  0.46839f,  1.32203f,  0.33923f,  0.97001f,  2.38584f,  1.58811f,
+  0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+  1.70385f,
+  1.82373f,
+  1.78496f,
+  1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
   8,  // num_inputs
   4,  // num_outputs
   1,  // num_hidden_layers
   {
       16,
   },  // num_hidden_nodes
+  { av1_tx_type_nn_weights_8x8_hor_layer0,
+    av1_tx_type_nn_weights_8x8_hor_layer1 },
+  { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+  -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+  2.09681f,  -0.05081f, -0.61030f, 2.02541f,  0.60222f,  0.99936f,  2.02114f,
+  -0.53893f, -0.23757f, 0.73566f,  0.25443f,  0.00132f,  -0.74036f, -0.75351f,
+  -0.76964f, -1.71007f, -0.15770f, 1.60982f,  2.17638f,  0.90681f,  0.64973f,
+  0.85914f,  0.58786f,  -1.46228f, 0.05187f,  1.18804f,  0.30850f,  0.29512f,
+  0.40526f,  0.37635f,  0.32311f,  0.37471f,  1.12346f,  3.41856f,  -0.36653f,
+  0.42537f,  -0.19240f, 0.00155f,  0.30826f,  -0.02116f, -0.53435f, -0.34829f,
+  -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+  -0.75257f, 0.10057f,  1.43474f,  0.89450f,  0.75900f,  1.11147f,  1.00558f,
+  0.25886f,  2.22095f,  -0.17926f, 0.57161f,  0.39546f,  0.47846f,  0.40452f,
+  0.54298f,  0.45814f,  -3.62788f, -3.02374f, 0.03716f,  -0.13937f, -0.09415f,
+  -0.12463f, 0.05682f,  0.03672f,  1.20746f,  1.25003f,  1.27071f,  1.31883f,
+  1.27473f,  1.34943f,  1.23158f,  0.09039f,  0.19388f,  0.63420f,  2.79612f,
+  0.93803f,  -0.11323f, -0.02027f, 0.41286f,  -0.05979f, -3.80705f, -0.52451f,
+  -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f,  0.05346f,
+  0.61403f,  0.32140f,  -2.39831f, -1.42355f, 1.30541f,  1.02361f,  0.12930f,
+  -1.61469f, -0.77036f, -0.59144f, 1.27769f,  1.52068f,  0.82137f,  1.83159f,
+  -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+  -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+  -0.14868f, -0.48343f, 3.94416f,  -0.78037f, -1.33789f, -0.60611f,
+  0.51793f,  0.44030f,  -0.71563f, 0.22561f,  -1.19083f, -0.46149f,
+  0.83015f,  0.06024f,  1.17180f,  0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+  -1.42711f, -0.21683f, 2.12061f,  0.20489f,  -0.50228f, -0.24770f, 0.23391f,
+  1.03470f,  -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+  1.43322f,  0.00280f,  -1.53057f, -0.18912f, 1.95333f,  0.31151f,  -2.07601f,
+  0.06776f,  0.25529f,  0.94800f,  -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+  0.17650f,  -0.07955f, 1.43734f,  -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+  0.30351f,  0.27594f,  -0.36245f, 0.19539f,  0.91045f,  -0.24068f, -0.37616f,
+  0.88792f,  0.02947f,  -0.16903f, -0.04932f, 1.51293f,  -0.95967f, -1.62903f,
+  0.05326f,  2.30703f,  0.64445f,  -1.09464f, -0.16623f, 1.00240f,  0.07548f,
+  -0.50406f, 0.63854f,  1.02340f,  0.49833f,  0.13671f,  0.26722f,  2.09516f,
+  -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+  2.14067f,
+  2.76699f,
+  2.04233f,
+  1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
   {
-      av1_tx_type_nn_weights_8x8_layer0,
-      av1_tx_type_nn_weights_8x8_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_8x8_layer0,
-      av1_tx_type_nn_bias_8x8_layer1,
-  },
+      16,
+  },  // num_hidden_nodes
+  { av1_tx_type_nn_weights_8x8_ver_layer0,
+    av1_tx_type_nn_weights_8x8_ver_layer1 },
+  { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
 };
 /******************************************************************************/
 
 // Tx type model for 8x16 block.
 static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
-  1.36274f,  1.37313f,  1.26859f,  1.26459f,  1.37979f,  1.47217f,  1.29710f,
-  0.15765f,  0.31552f,  -0.05727f, 0.25562f,  0.47925f,  -0.32913f, -0.55757f,
-  -0.98010f, 0.08568f,  -0.62754f, 0.12834f,  -0.03717f, 0.06286f,  0.26159f,
-  0.26023f,  -0.62605f, 1.34500f,  1.47720f,  0.47937f,  0.84793f,  0.87866f,
-  0.81260f,  0.74761f,  0.84217f,  0.53321f,  -0.78232f, 0.35321f,  0.41240f,
-  0.45002f,  0.88973f,  0.51055f,  0.91115f,  -0.45512f, -2.37418f, -0.25205f,
-  0.05893f,  -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f,  0.18796f,
-  -0.05797f, 0.26484f,  1.23515f,  1.70393f,  0.46022f,  -0.14354f, 0.08501f,
-  -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f,
-  1.59912f,  -0.40332f, -0.33209f, 0.37274f,  0.36831f,  -0.00248f, -0.24295f,
-  0.29539f,  -0.76136f, -0.22531f, 0.12371f,  0.37889f,  1.02639f,  1.73330f,
-  1.09686f,  1.04111f,  0.69006f,  -1.27157f, 0.94013f,  0.61621f,  0.62274f,
-  0.48759f,  0.55672f,  0.62597f,  -0.38846f, 1.72124f,  0.08214f,  -0.06650f,
-  0.32617f,  0.10958f,  0.24650f,  0.10740f,  1.16861f,  0.50701f,  0.45383f,
-  0.90016f,  -0.00695f, -0.11986f, -0.07834f, 0.20346f,  0.25863f,  -0.40889f,
-  -0.11344f, -0.79108f, 0.76259f,  -0.14562f, -0.15459f, -0.20954f, -0.51306f,
-  0.02743f,  -0.82456f, -0.00861f, -0.27274f, 0.28762f,  0.07282f,  0.26410f,
-  0.53413f,  -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f,  0.80313f,
-  1.07698f,  0.02531f,
+  -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+  0.73431f,  1.10135f,  0.47054f,  0.43230f,  -0.43009f, -0.09135f, -0.07289f,
+  -0.38785f, 1.23775f,  -0.35312f, 0.73789f,  0.88864f,  0.75957f,  0.62579f,
+  0.46974f,  0.21851f,  1.63821f,  -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+  -0.91320f, -0.63055f, -1.03296f, 0.55778f,  -0.00071f, 1.27539f,  1.60068f,
+  1.40975f,  0.97372f,  0.92843f,  1.90853f,  0.12626f,  1.71953f,  1.41978f,
+  -0.12234f, -1.27058f, 0.76207f,  0.02495f,  -0.67038f, -0.05255f, 1.72923f,
+  1.47630f,  1.47058f,  1.47614f,  1.49354f,  1.66131f,  1.50801f,  0.17145f,
+  -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f,  1.26572f,  0.97865f,
+  -0.65466f, 1.31129f,  0.26916f,  0.12139f,  -0.12761f, -0.39143f, -0.28134f,
+  0.06584f,  2.24418f,  0.22516f,  0.05011f,  -0.01671f, -0.29476f, -0.40326f,
+  0.21138f,  -0.11573f, -0.31154f, -0.36828f, 0.03694f,  -0.07172f, -0.63419f,
+  -3.14351f, -1.23125f, 0.65311f,  -0.11406f, 1.97287f,  -0.10422f, 0.83896f,
+  0.85033f,  0.49724f,  0.80482f,  0.51454f,  1.06447f,  0.76693f,  0.72599f,
+  -0.78573f, -0.53950f, 0.40894f,  0.00086f,  0.10784f,  -0.70498f, 1.16395f,
+  1.14597f,  1.13496f,  1.12177f,  1.02100f,  -1.37574f, -2.97144f, 0.33899f,
+  0.42013f,  0.86327f,  2.31983f,  2.04008f,  0.95503f,  0.15081f,  0.11530f,
+  -0.02574f, -4.77119f, 0.13257f,  -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+  -0.28136f, 0.42556f,
 };
 
 static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
-  -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f,  0.29276f,
-  -0.41990f, -0.96924f, -0.30933f, 0.95264f,  -0.25330f, -1.19584f,
-  1.46564f,  -0.42959f, 1.55720f,  0.18479f,
+  0.93617f,  -0.24000f, -1.26821f, 0.78780f,  0.13690f, -0.21948f,
+  -1.45162f, 0.44584f,  -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+  1.81560f,  -1.02643f, -0.81690f, 0.08302f,
 };
 
 static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
-  -1.72959f, -0.21670f, 0.10616f,  -0.02006f, 0.15084f,  -0.85303f, -0.27535f,
-  0.58704f,  0.23683f,  1.19743f,  0.77971f,  0.49874f,  0.19508f,  0.19641f,
-  1.47895f,  -0.52173f, -0.56746f, -0.50761f, 0.15864f,  -0.95168f, 0.48103f,
-  0.91904f,  -0.11700f, 0.62863f,  0.06526f,  1.63803f,  -0.72325f, -1.80449f,
-  0.66373f,  0.12831f,  0.27139f,  -0.26346f, 1.50852f,  0.25079f,  -0.54255f,
-  1.78815f,  1.39691f,  -0.44989f, -0.18511f, -1.52903f, 0.13983f,  1.06906f,
-  -0.30184f, 0.37566f,  0.46209f,  0.10440f,  0.64695f,  -0.34002f, 1.96990f,
-  0.21189f,  -0.91248f, -0.11263f, 0.26708f,  1.27405f,  1.89776f,  0.02081f,
-  -0.06977f, -0.02584f, 0.47733f,  0.27117f,  1.33315f,  -0.09175f, 0.48747f,
-  1.16772f,
+  0.06696f,  -0.11538f, -1.42029f, 0.32965f,  0.81046f,  0.01146f,  1.20945f,
+  -0.16899f, 0.53224f,  -0.40232f, 0.01786f,  -0.73242f, 1.29750f,  1.95185f,
+  0.70143f,  1.43287f,  0.76220f,  0.79937f,  -1.79011f, -1.15178f, 0.42526f,
+  -0.67519f, 0.77267f,  -0.30697f, 2.46004f,  -0.49828f, 0.02875f,  1.09972f,
+  1.47662f,  0.61719f,  0.61417f,  -0.12363f, 2.53048f,  0.00418f,  -1.38964f,
+  0.88117f,  0.39239f,  -0.19347f, -2.58600f, -0.33715f, 1.09323f,  -0.32127f,
+  0.02456f,  -0.19125f, 1.12728f,  0.66502f,  0.34296f,  1.14897f,  0.29967f,
+  1.19209f,  0.22108f,  -0.11975f, 1.49776f,  -1.34624f, -2.58478f, -1.34632f,
+  1.53207f,  0.45634f,  -1.48476f, 0.17489f,  0.71790f,  -2.12086f, -1.21778f,
+  -1.31243f,
 };
 
 static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
-  1.25783f,
-  1.19452f,
-  0.69964f,
-  2.41982f,
+  0.83359f,
+  1.06875f,
+  1.77645f,
+  1.49570f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
@@ -412,62 +482,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_8x16_hor_layer0,
-      av1_tx_type_nn_weights_8x16_hor_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_8x16_hor_layer0,
-      av1_tx_type_nn_bias_8x16_hor_layer1,
-  },
+  { av1_tx_type_nn_weights_8x16_hor_layer0,
+    av1_tx_type_nn_weights_8x16_hor_layer1 },
+  { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
 };
 
 static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
-  0.90888f,  0.86305f,  0.81674f,  0.75352f,  1.07834f,  0.99048f,  0.96355f,
-  0.13836f,  -0.51334f, 0.19906f,  1.84608f,  0.67828f,  0.45876f,  0.08325f,
-  0.28190f,  -0.01958f, -1.96553f, 0.27837f,  -0.05929f, 0.13491f,  0.21036f,
-  0.05797f,  -0.01373f, 0.73765f,  1.39603f,  -0.53767f, 0.10362f,  0.03420f,
-  0.41909f,  0.09510f,  0.32284f,  0.83860f,  0.13954f,  0.48434f,  1.47762f,
-  0.45891f,  0.23613f,  0.13013f,  0.82097f,  -0.03251f, -1.89757f, 0.21589f,
-  -0.10370f, 0.02530f,  -0.25659f, 0.01466f,  -0.23661f, 0.22783f,  0.92100f,
-  1.02915f,  1.20358f,  1.17251f,  0.97749f,  1.04696f,  0.91333f,  0.54576f,
-  -0.52792f, 0.02217f,  0.25652f,  0.31405f,  -0.18398f, 0.04572f,  -0.81359f,
-  1.82883f,  -0.40047f, -0.33056f, 0.35255f,  0.34448f,  -0.00339f, -0.23857f,
-  0.28925f,  -0.77175f, -0.24325f, -0.21420f, 1.11451f,  1.39553f,  0.51573f,
-  0.05476f,  1.13791f,  0.94959f,  -0.35710f, 0.67467f,  0.16722f,  0.61213f,
-  0.07683f,  -0.20613f, 0.13440f,  -0.72131f, -0.15418f, -0.17688f, -0.16510f,
-  -0.19226f, 0.09270f,  -2.43559f, -0.12669f, 0.05074f,  0.30414f,  0.00927f,
-  0.60630f,  0.00801f,  -1.07310f, -0.06227f, 2.10607f,  0.02382f,  -0.39891f,
-  -0.09149f, -0.78596f, 0.83966f,  -0.14802f, -0.14083f, -0.20831f, -0.55136f,
-  0.08566f,  -0.00647f, 0.07044f,  0.53408f,  0.85720f,  -0.07393f, 0.24476f,
-  0.43767f,  0.30519f,  -1.89430f, 0.23252f,  1.63790f,  0.17316f,  -0.03903f,
-  0.25269f,  0.01562f,
+  0.32858f,  -1.28887f, 0.25632f,  -0.05262f, 2.69203f,  -0.07004f, 1.37337f,
+  -0.05725f, -0.05659f, 0.05592f,  0.01039f,  -0.29343f, 1.58628f,  -0.30003f,
+  -3.43118f, 0.00272f,  1.70928f,  -0.76348f, 0.05889f,  -0.03263f, -0.07724f,
+  0.03523f,  -0.19890f, 1.18005f,  -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+  -0.05368f, -0.17650f, -0.15317f, 0.06499f,  0.56705f,  1.04341f,  0.62890f,
+  0.73451f,  -0.22199f, 0.86659f,  0.78443f,  -0.61664f, -0.50606f, 0.30247f,
+  0.14455f,  0.39276f,  0.49203f,  0.65019f,  0.12269f,  1.64080f,  1.68289f,
+  1.42694f,  1.60825f,  1.58501f,  1.47252f,  1.62589f,  1.48218f,  0.17726f,
+  -0.04884f, 0.35376f,  -0.04796f, 0.32589f,  0.35087f,  0.35258f,  -0.46103f,
+  -0.31176f, -0.05203f, 0.07247f,  -0.26756f, 0.22019f,  0.03412f,  0.33773f,
+  0.29811f,  -0.11140f, 0.12831f,  -0.44673f, -0.09858f, 0.07889f,  0.15137f,
+  0.00347f,  -0.23394f, 0.08886f,  -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+  -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+  -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f,  -0.30681f, 0.04494f,
+  -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f,  -0.41224f,
+  -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+  -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+  0.36381f,  -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+  -0.12236f, 0.16075f,
 };
 
 static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
-  -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f,  0.16745f,
-  -1.34680f, -1.07083f, -0.34649f, 0.65598f,  -0.56278f, 0.22660f,
-  -0.25956f, -0.29608f, 1.24359f,  -0.09167f,
+  -0.35385f, 0.30491f,  -0.90011f, 0.42941f,  1.20928f, -0.88331f,
+  -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+  0.57598f,  0.99819f,  0.75175f,  0.17044f,
 };
 
 static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
-  -0.71147f, -0.63964f, -0.69220f, 0.22326f,  0.67191f,  -0.58894f, -0.98464f,
-  0.23583f,  0.22824f,  1.39838f,  0.09920f,  -0.59411f, -0.67101f, 0.19088f,
-  0.83025f,  -0.66991f, -0.42889f, -0.49969f, 1.39532f,  -1.02000f, 0.62101f,
-  0.57175f,  -0.83226f, 0.01551f,  0.05604f,  1.23028f,  0.02030f,  -0.55995f,
-  -0.42349f, 0.15375f,  0.52132f,  -0.52421f, 0.89586f,  -0.73778f, -0.10911f,
-  0.22447f,  1.16858f,  -0.48169f, 1.73890f,  -0.69860f, 0.12504f,  1.10492f,
-  0.04391f,  -0.85670f, -0.49257f, 0.09616f,  0.76518f,  -0.44854f, 1.50938f,
-  0.62246f,  -0.40366f, -0.11182f, -0.01680f, 0.59724f,  1.32170f,  -1.09061f,
-  -0.04278f, -0.02449f, 0.25024f,  1.26239f,  0.42345f,  -0.10031f, 0.80871f,
-  0.44198f,
+  -0.62913f, -0.34304f, 0.42963f,  -0.17440f, -1.44092f, 0.69142f,  -1.36067f,
+  0.52211f,  0.44658f,  -0.26501f, -0.41657f, 0.34428f,  -0.34390f, -0.58567f,
+  -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+  0.58755f,  -1.30559f, 0.39551f,  0.41743f,  -0.09940f, -0.33230f, 0.14458f,
+  -0.25139f, -0.54517f, 0.13469f,  -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+  -0.08395f, -0.92187f, 0.56724f,  1.44381f,  0.53226f,  -0.22356f, 0.12285f,
+  -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+  0.62641f,  -0.11823f, 1.00395f,  1.64794f,  -0.64535f, 2.29322f,  -0.23397f,
+  0.17251f,  -0.35927f, 0.65631f,  -0.26812f, 0.80128f,  0.85748f,  0.47404f,
+  2.20547f,
 };
 
 static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
-  0.68329f,
-  1.33555f,
-  0.25943f,
-  3.23439f,
+  -0.44080f,
+  -1.67455f,
+  -1.46332f,
+  -6.13206f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
@@ -477,64 +542,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_8x16_ver_layer0,
-      av1_tx_type_nn_weights_8x16_ver_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_8x16_ver_layer0,
-      av1_tx_type_nn_bias_8x16_ver_layer1,
-  },
+  { av1_tx_type_nn_weights_8x16_ver_layer0,
+    av1_tx_type_nn_weights_8x16_ver_layer1 },
+  { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
 };
 /******************************************************************************/
 
 // Tx type model for 16x8 block.
 static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
-  0.89821f,  0.90804f,  1.13052f,  0.74855f,  1.02053f,  0.91260f,  0.97102f,
-  0.16808f,  -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
-  -0.12236f, -0.03158f, -1.43561f, 0.07794f,  0.16586f,  0.09731f,  0.12967f,
-  0.09725f,  -0.16826f, 1.26640f,  0.88004f,  0.27312f,  -0.07993f, 0.33640f,
-  0.11732f,  0.33384f,  0.97066f,  -0.61744f, -0.48545f, 0.44622f,  0.73744f,
-  0.32262f,  -0.05713f, 0.42280f,  1.10378f,  0.18540f,  -2.07906f, 0.11443f,
-  0.37877f,  0.24136f,  -0.12524f, -0.12434f, 0.02116f,  0.11716f,  1.28267f,
-  1.01508f,  1.26184f,  1.22545f,  1.29582f,  1.18855f,  1.27564f,  0.42001f,
-  -0.41481f, 0.06725f,  -0.13133f, -0.24801f, 0.16515f,  0.16228f,  0.35197f,
-  0.53610f,  -0.39805f, -0.32584f, 0.40096f,  0.38621f,  -0.00030f, -0.23434f,
-  0.29149f,  -0.76542f, 0.04996f,  -0.30036f, 1.48687f,  0.90852f,  -0.03083f,
-  -0.15953f, 1.19259f,  0.87690f,  -1.08977f, 0.78757f,  0.81149f,  0.54089f,
-  0.35400f,  0.37919f,  0.84997f,  -0.20449f, 0.39601f,  -0.37596f, 0.64748f,
-  0.26021f,  0.37354f,  0.23593f,  0.16335f,  1.70681f,  0.31800f,  -0.00964f,
-  0.82687f,  -0.78372f, -1.47438f, 0.32410f,  1.37436f,  0.07476f,  -0.40574f,
-  -0.10353f, -0.79300f, 0.74381f,  -0.15601f, -0.14380f, -0.20961f, -0.52697f,
-  0.04669f,  -0.00870f, 0.05624f,  -0.09036f, 0.25701f,  0.30336f,  0.24199f,
-  0.45579f,  0.66330f,  -1.81834f, 0.74965f,  1.22747f,  0.25072f,  0.25100f,
-  0.43289f,  -0.00362f,
+  0.02600f,  0.09786f,  -1.05107f, -0.35594f, -0.15658f, 2.99828f,  -0.07106f,
+  -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f,  1.91727f,  -0.00956f,
+  -0.90640f, 0.09174f,  1.58895f,  1.38945f,  1.49431f,  1.51381f,  1.44803f,
+  1.53544f,  1.44694f,  0.17753f,  1.69735f,  -0.78652f, 0.31092f,  -0.23736f,
+  0.02231f,  -0.09884f, -0.00493f, 1.21189f,  -1.94382f, -0.34629f, -0.58309f,
+  0.72291f,  -0.30056f, 0.90660f,  -0.57495f, 3.07809f,  0.73644f,  1.43050f,
+  1.34356f,  -0.66554f, 0.50102f,  -0.64305f, 0.42044f,  -1.66165f, -0.05733f,
+  -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f,  -0.07290f,
+  -0.26290f, -0.68941f, 1.81156f,  0.66125f,  -2.09974f, 0.17032f,  -0.67461f,
+  -0.00876f, -1.50154f, 1.17153f,  1.00377f,  0.33022f,  0.74689f,  0.42878f,
+  0.61725f,  -0.83967f, 0.09467f,  -0.39892f, 0.33863f,  0.10656f,  -0.09249f,
+  -0.39757f, 0.48481f,  -0.35162f, 1.47014f,  1.67827f,  -1.84051f, 0.16291f,
+  -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f,  -0.14743f, -0.02763f,
+  -0.28003f, -0.01364f, 0.21014f,  -0.29026f, -0.20198f, 1.38782f,  0.56731f,
+  0.27489f,  0.43227f,  0.41326f,  0.42721f,  0.87720f,  -1.90067f, -5.04951f,
+  -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+  -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+  0.28689f,  0.32394f,  0.52128f,  0.01013f,  -0.28948f, -0.26293f, -0.44331f,
+  -0.36570f, -0.50757f,
 };
 
 static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
-  -0.87643f, 0.36754f,  -0.86409f, 1.37761f,  1.22688f,  0.09074f,
-  -1.47139f, -1.06100f, -0.24087f, 1.10382f,  -0.32837f, -1.39592f,
-  -0.14741f, -0.43954f, 1.72137f,  -0.21704f,
+  -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+  0.45260f,  0.16229f,  4.01393f,  -0.21748f, 0.36411f,  -0.08764f,
+  -0.12329f, 0.08986f,  1.08117f,  -0.00220f,
 };
 
 static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
-  -0.81860f, -0.80745f, -0.43612f, 0.58656f,  0.37455f, -0.56519f, -1.71536f,
-  0.23278f,  0.23951f,  1.09610f,  0.49986f,  0.43375f, -0.53182f, 0.17376f,
-  1.05626f,  -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f,  1.14295f,
-  0.45571f,  -0.52504f, -0.00303f, 0.06044f,  0.66119f, -0.60340f, -1.14344f,
-  -0.28045f, 0.12742f,  0.61484f,  -0.41016f, 1.36102f, -0.86969f, -0.52728f,
-  1.01725f,  0.67083f,  -0.10138f, 1.36406f,  0.34066f, 0.12498f,  0.86595f,
-  -0.39636f, -0.27888f, -0.40244f, 0.09847f,  0.81178f, -0.45313f, 1.39127f,
-  0.99865f,  -0.57908f, 0.55072f,  0.49638f,  1.11524f, 1.85504f,  -0.28316f,
-  -0.05195f, -0.23284f, 0.26461f,  -1.28120f, 0.60707f, -0.06110f, 0.74085f,
-  0.63304f,
+  0.55824f,  -0.14648f, 0.81947f,  -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+  0.15153f,  1.75625f,  -0.25760f, 0.72015f,  -0.30059f, -0.57975f, 0.07609f,
+  -0.02036f, 0.07912f,  0.57080f,  -0.13792f, 0.74184f,  -0.87669f, -1.87572f,
+  -0.27270f, 0.39751f,  0.19652f,  2.03514f,  -0.32944f, 0.76251f,  0.04399f,
+  -0.63175f, 0.37420f,  0.08309f,  0.04466f,  0.60255f,  -0.12820f, 1.66065f,
+  -0.59496f, -1.94794f, -0.14847f, 0.39424f,  0.16273f,  1.80587f,  0.41197f,
+  0.74691f,  -0.21217f, -0.63173f, 0.09510f,  -0.35538f, -0.04407f, 0.92847f,
+  0.20141f,  1.68680f,  -0.56528f, -2.26960f, 0.12978f,  0.73748f,  0.42438f,
+  2.00673f,  -0.40189f, 0.95423f,  0.23234f,  -0.80953f, 0.65814f,  0.49444f,
+  -0.23347f,
 };
 
 static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
-  0.71765f,
-  1.40400f,
-  0.32221f,
-  3.07234f,
+  3.57175f,
+  2.42612f,
+  3.31259f,
+  2.08287f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
@@ -544,62 +604,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_16x8_hor_layer0,
-      av1_tx_type_nn_weights_16x8_hor_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_16x8_hor_layer0,
-      av1_tx_type_nn_bias_16x8_hor_layer1,
-  },
+  { av1_tx_type_nn_weights_16x8_hor_layer0,
+    av1_tx_type_nn_weights_16x8_hor_layer1 },
+  { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
 };
 
 static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
-  1.20497f,  1.23691f,  1.23738f,  1.07773f,  1.15264f,  1.31959f,  1.15365f,
-  0.17179f,  0.68612f,  0.55636f,  0.57145f,  0.67022f,  0.19636f,  -1.27420f,
-  -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f,
-  -1.02689f, -0.88153f, 0.83857f,  1.53355f,  0.13601f,  0.35451f,  0.53750f,
-  0.62381f,  0.32438f,  0.59405f,  0.33090f,  -1.52948f, -0.46094f, 0.42634f,
-  0.48763f,  0.30707f,  0.52553f,  0.71427f,  -0.31287f, -2.37106f, -0.18756f,
-  0.16561f,  -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f,  0.45010f,
-  -0.00317f, -0.06403f, 0.95442f,  1.59636f,  0.30602f,  -0.05515f, 0.05467f,
-  -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f,  0.26141f,  -0.32174f,
-  1.78129f,  -0.40161f, -0.33158f, 0.38084f,  0.38081f,  0.01053f,  -0.23567f,
-  0.29239f,  -0.76159f, -0.19373f, 0.13649f,  0.66949f,  1.19733f,  1.92557f,
-  1.16691f,  0.94955f,  0.62324f,  -0.85434f, -0.07699f, 0.87683f,  0.95911f,
-  0.86106f,  0.57959f,  0.40146f,  -0.35851f, 1.55427f,  0.15349f,  -0.01582f,
-  0.32517f,  0.03784f,  0.15916f,  0.09024f,  1.43187f,  0.56160f,  0.11521f,
-  0.52476f,  -0.26107f, -0.38167f, -0.31596f, 0.31304f,  -0.65366f, -0.40680f,
-  -0.11082f, -0.78585f, 0.77906f,  -0.13322f, -0.13747f, -0.21001f, -0.53204f,
-  -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f,  0.13586f,  -0.42001f,
-  0.85388f,  0.08300f,  -0.89325f, -1.73681f, -0.70473f, 0.23151f,  0.69549f,
-  0.72124f,  0.12769f,
+  0.46633f,  1.55328f,  -0.11230f, -0.29571f, 0.18814f,  -1.52430f, -2.34660f,
+  0.08644f,  -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+  -1.57129f, 0.96021f,  1.34192f,  1.28623f,  1.21655f,  1.28758f,  1.25482f,
+  1.30195f,  1.19190f,  0.09310f,  0.52072f,  0.91487f,  1.24100f,  1.61236f,
+  1.72166f,  2.20750f,  1.62379f,  -1.43936f, 0.50665f,  0.40213f,  0.66502f,
+  -1.66699f, -3.07618f, 0.05877f,  0.60987f,  -0.09995f, -0.10916f, 0.48049f,
+  0.23812f,  0.39847f,  -0.21682f, -0.63455f, 0.33453f,  -0.67939f, -4.14355f,
+  -0.62756f, -0.22502f, -0.17215f, 0.01062f,  0.27049f,  -0.10748f, 0.30945f,
+  2.72445f,  -0.89181f, -0.06800f, 0.20595f,  -0.73385f, 0.04071f,  -1.30294f,
+  1.83507f,  0.92570f,  0.69609f,  0.76285f,  0.69892f,  0.76409f,  0.63104f,
+  0.73397f,  1.09575f,  -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+  -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+  -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f,  -0.75580f, -0.65263f,
+  -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f,  -0.53611f,
+  0.19752f,  -0.16842f, -0.24828f, 0.21857f,  0.08222f,  -2.55894f, -1.75702f,
+  0.11394f,  1.03083f,  0.79972f,  -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+  -0.39616f, -0.00995f, -0.12809f, 0.01188f,  -0.25117f, 0.09202f,  0.09336f,
+  -0.05614f, -0.30039f, 0.25834f,  1.19944f,  1.22533f,  0.92330f,  0.75967f,
+  -0.81945f, -0.41647f,
 };
 
 static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
-  -1.15644f, -0.31062f, 0.20697f,  -0.60304f, -1.19498f, 0.21451f,
-  -0.42825f, -0.71800f, -0.25816f, 1.47408f,  -0.24423f, -1.45773f,
-  -0.55834f, -0.36938f, 1.56759f,  0.07238f,
+  0.17841f,  0.67315f,  -1.24450f, 3.13859f,  0.16203f, -0.14992f,
+  0.29553f,  -1.15567f, -0.71421f, 1.15977f,  1.14585f, 3.02460f,
+  -0.04510f, 0.48000f,  -0.09354f, -0.42422f,
 };
 
 static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
-  -1.45227f, -0.67141f, 0.75237f,  0.32681f,  -0.70528f, -0.76730f, -0.49777f,
-  0.02418f,  0.25096f,  1.14840f,  0.23548f,  0.48755f,  0.33164f,  0.21050f,
-  1.41651f,  -0.28888f, -0.76668f, 0.04439f,  0.67538f,  -1.06438f, 0.68128f,
-  0.95824f,  0.08530f,  -0.03635f, 0.06820f,  1.38621f,  -0.50424f, -1.72992f,
-  -0.20949f, 0.13400f,  0.93366f,  -0.05324f, 1.41593f,  -0.75119f, -1.80912f,
-  1.05440f,  0.62580f,  -0.30867f, -0.07025f, -0.34654f, 0.13621f,  1.74426f,
-  -0.22417f, 0.47031f,  -0.08142f, 0.10151f,  0.42498f,  0.06635f,  1.50623f,
-  1.04130f,  0.85107f,  0.23382f,  0.69800f,  1.10856f,  1.18767f,  -0.69395f,
-  -0.07985f, 0.50412f,  0.46019f,  0.49214f,  0.44219f,  -0.09502f, 0.75745f,
-  0.99208f,
+  0.29912f,  -0.10009f, -1.11478f, 1.76812f,  -0.27719f, 0.52148f,  0.17622f,
+  -1.17116f, 0.73397f,  -0.69279f, -0.11080f, 1.53751f,  -1.42003f, 0.14731f,
+  0.13592f,  -0.04883f, 0.39186f,  -0.13655f, -0.43994f, 1.82759f,  -0.25601f,
+  -0.15018f, 0.51920f,  -1.56070f, 0.31683f,  -0.79367f, -0.02904f, 1.28637f,
+  -1.15203f, 0.26627f,  0.42828f,  -0.24258f, 0.38647f,  -0.83352f, 0.32553f,
+  2.09522f,  -0.26822f, -0.42191f, 0.32825f,  -1.30748f, 1.50551f,  -0.52669f,
+  0.20045f,  1.69318f,  -1.47839f, 0.30802f,  -0.07290f, -0.28106f, 0.68192f,
+  -0.15522f, 1.12579f,  2.21921f,  0.09720f,  -0.50265f, 0.83165f,  -1.31721f,
+  0.72422f,  -1.24952f, 0.61653f,  2.04117f,  -1.42406f, 0.52568f,  -0.46180f,
+  -0.00873f,
 };
 
 static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
-  0.68774f,
-  0.88572f,
-  0.77462f,
-  3.05667f,
+  3.34981f,
+  3.74710f,
+  1.38339f,
+  0.45176f,
 };
 
 static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
@@ -609,14 +664,9 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_16x8_ver_layer0,
-      av1_tx_type_nn_weights_16x8_ver_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_16x8_ver_layer0,
-      av1_tx_type_nn_bias_16x8_ver_layer1,
-  },
+  { av1_tx_type_nn_weights_16x8_ver_layer0,
+    av1_tx_type_nn_weights_16x8_ver_layer1 },
+  { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
 };
 /******************************************************************************/
 
@@ -687,445 +737,253 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
 };
 /******************************************************************************/
 
-// Tx type model for 16x32 block.
-static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = {
-  0.89821f,  0.90804f,  1.13052f,  0.74855f,  1.02053f,  0.91260f,  0.97102f,
-  0.16808f,  -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
-  -0.12236f, -0.03158f, -1.43561f, 0.07794f,  0.16586f,  0.09731f,  0.12967f,
-  0.09725f,  -0.16826f, 1.26640f,  0.88004f,  0.27312f,  -0.07993f, 0.33640f,
-  0.11732f,  0.33384f,  0.97066f,  -0.61744f, -0.48545f, 0.44622f,  0.73744f,
-  0.32262f,  -0.05713f, 0.42280f,  1.10378f,  0.18540f,  -2.07906f, 0.11443f,
-  0.37877f,  0.24136f,  -0.12524f, -0.12434f, 0.02116f,  0.11716f,  1.28267f,
-  1.01508f,  1.26184f,  1.22545f,  1.29582f,  1.18855f,  1.27564f,  0.42001f,
-  -0.41481f, 0.06725f,  -0.13133f, -0.24801f, 0.16515f,  0.16228f,  0.35197f,
-  0.53610f,  -0.39805f, -0.32584f, 0.40096f,  0.38621f,  -0.00030f, -0.23434f,
-  0.29149f,  -0.76542f, 0.04996f,  -0.30036f, 1.48687f,  0.90852f,  -0.03083f,
-  -0.15953f, 1.19259f,  0.87690f,  -1.08977f, 0.78757f,  0.81149f,  0.54089f,
-  0.35400f,  0.37919f,  0.84997f,  -0.20449f, 0.39601f,  -0.37596f, 0.64748f,
-  0.26021f,  0.37354f,  0.23593f,  0.16335f,  1.70681f,  0.31800f,  -0.00964f,
-  0.82687f,  -0.78372f, -1.47438f, 0.32410f,  1.37436f,  0.07476f,  -0.40574f,
-  -0.10353f, -0.79300f, 0.74381f,  -0.15601f, -0.14380f, -0.20961f, -0.52697f,
-  0.04669f,  -0.00870f, 0.05624f,  -0.09036f, 0.25701f,  0.30336f,  0.24199f,
-  0.45579f,  0.66330f,  -1.81834f, 0.74965f,  1.22747f,  0.25072f,  0.25100f,
-  0.43289f,  -0.00362f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = {
-  -0.87643f, 0.36754f,  -0.86409f, 1.37761f,  1.22688f,  0.09074f,
-  -1.47139f, -1.06100f, -0.24087f, 1.10382f,  -0.32837f, -1.39592f,
-  -0.14741f, -0.43954f, 1.72137f,  -0.21704f,
-};
-
-static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = {
-  -0.81860f, -0.80745f, -0.43612f, 0.58656f,  0.37455f, -0.56519f, -1.71536f,
-  0.23278f,  0.23951f,  1.09610f,  0.49986f,  0.43375f, -0.53182f, 0.17376f,
-  1.05626f,  -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f,  1.14295f,
-  0.45571f,  -0.52504f, -0.00303f, 0.06044f,  0.66119f, -0.60340f, -1.14344f,
-  -0.28045f, 0.12742f,  0.61484f,  -0.41016f, 1.36102f, -0.86969f, -0.52728f,
-  1.01725f,  0.67083f,  -0.10138f, 1.36406f,  0.34066f, 0.12498f,  0.86595f,
-  -0.39636f, -0.27888f, -0.40244f, 0.09847f,  0.81178f, -0.45313f, 1.39127f,
-  0.99865f,  -0.57908f, 0.55072f,  0.49638f,  1.11524f, 1.85504f,  -0.28316f,
-  -0.05195f, -0.23284f, 0.26461f,  -1.28120f, 0.60707f, -0.06110f, 0.74085f,
-  0.63304f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = {
-  0.71765f,
-  1.40400f,
-  0.32221f,
-  3.07234f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = {
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+  0.36539f,  0.25667f,  0.01491f,  -0.21959f, 2.55105f,  0.17615f, 1.79884f,
+  1.65936f,  -0.44363f, 0.00706f,  -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+  1.47682f,  0.09650f,  -3.59244f, -0.35004f, 0.93295f,  0.25806f, -0.08154f,
+  0.79332f,  0.79535f,  1.09467f,  1.57855f,  -0.51359f, 0.90553f, -1.67744f,
+  -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+  -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+  0.62806f,  -0.20675f, 4.91940f,  -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+  -0.57191f, -1.46418f, 0.67331f,  -1.15027f, 0.46288f,  0.81251f,  2.51768f,
+  -0.27147f, 0.00761f,  -2.15214f, -0.69650f, -0.50808f, 0.92832f,  0.45668f,
+  2.34201f,  -0.52941f, 0.51008f,  -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+  0.88043f,  2.64862f,  -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+  1.28413f,  -0.30326f, 2.45329f,  -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+  2.33198f,
+  3.36245f,
+  1.62603f,
+  2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+  4,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  { av1_tx_type_nn_weights_4x16_hor_layer0,
+    av1_tx_type_nn_weights_4x16_hor_layer1 },
+  { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+  1.61392f,  1.41239f,  1.47646f,  1.47325f,  1.46110f,  1.49208f,  1.49414f,
+  0.12835f,  -0.76986f, 0.07087f,  -0.24572f, -0.93168f, 3.07935f,  -0.18183f,
+  -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f,  -0.38711f,
+  -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+  -0.03305f, -4.07069f, -2.76643f, 0.04413f,  -1.03176f, -0.19217f, -0.44980f,
+  -2.48615f, -2.58112f, -0.87695f, 0.16187f,  -0.04891f, -0.06854f, 1.08104f,
+  0.75245f,  1.49302f,  0.63363f,  1.45715f,  0.92574f,  1.72029f,  0.33326f,
+  3.86646f,  0.04422f,  0.41019f,  0.36212f,  0.56600f,  -1.01552f, 0.05128f,
+  0.40454f,  -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+  -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f,  0.41010f,  0.31223f,
+  -0.43382f, -0.74715f, 2.03366f,  -0.30419f, 0.45747f,  0.09526f,  0.31678f,
+  0.22915f,  0.21832f,  1.26385f,  -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+  0.10936f,  2.97396f,  -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+  0.05173f,  -0.44274f, -0.15738f, 0.11311f,  0.43872f,  0.16837f,  -0.52849f,
+  2.90050f,  -0.54735f, -0.29591f, 1.24030f,  0.21696f,  -0.04443f, -1.60877f,
+  -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+  0.42820f,  -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f,  2.23477f,
+  0.01370f,  -0.20426f, -1.51411f, -0.72293f, 0.64516f,  0.97638f,  0.32616f,
+  -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+  -1.37863f, -0.05763f, -0.07041f, 0.15306f,  0.96026f,  -1.42105f,
+  -0.55822f, 1.04845f,  -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+  -0.32530f, 0.73483f,  0.08322f,  -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+  0.27194f,  0.50607f,  0.49229f,  -0.48192f, 0.15667f,  -1.38891f, 0.38102f,
+  -0.58825f, -0.07337f, -0.52909f, 0.36975f,  0.28710f,  0.34992f,  -0.73630f,
+  0.30386f,  -0.58822f, 0.36127f,  0.57950f,  0.55878f,  -0.42796f, 0.19967f,
+  -1.45517f, 0.42529f,  -0.54630f, -0.38169f, -0.84899f, 0.41622f,  0.46935f,
+  0.39077f,  -0.75448f, 0.31698f,  -0.76187f, 0.97765f,  0.57052f,  0.55825f,
+  -0.54273f, 0.20466f,  -1.46347f, 0.41813f,  -0.55019f, -0.19948f, -0.57982f,
+  0.41206f,  0.32373f,  0.38537f,  -1.11657f, 0.32887f,  -0.76911f, 1.12259f,
+  0.72163f,  0.82603f,  0.37786f,  0.34976f,  -1.86642f, 0.59961f,  -0.16329f,
+  -0.36631f, -0.56814f, 0.60410f,  0.53158f,  0.56389f,  -0.70508f, 0.51009f,
+  -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+  4.60896f,
+  4.53551f,
+  4.53124f,
+  4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
   8,  // num_inputs
   4,  // num_outputs
   1,  // num_hidden_layers
   {
       16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_16x32_hor_layer0,
-      av1_tx_type_nn_weights_16x32_hor_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_16x32_hor_layer0,
-      av1_tx_type_nn_bias_16x32_hor_layer1,
-  },
+  { av1_tx_type_nn_weights_4x16_ver_layer0,
+    av1_tx_type_nn_weights_4x16_ver_layer1 },
+  { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
 };
+/******************************************************************************/
 
-static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = {
-  -0.01219f, 0.51494f,  0.25450f,  0.45788f,  -0.87277f, 0.32954f,  -0.04851f,
-  -0.24321f, -0.40000f, 0.21915f,  0.14108f,  0.98268f,  0.18989f,  0.54298f,
-  0.36349f,  0.38931f,  1.08124f,  0.87199f,  1.03553f,  1.14777f,  1.04254f,
-  1.11336f,  0.92198f,  0.84715f,  1.89363f,  1.21587f,  0.72377f,  1.25097f,
-  0.84231f,  0.95529f,  1.12346f,  0.19113f,  -0.04559f, 0.56859f,  0.59747f,
-  0.60176f,  0.82465f,  0.59009f,  0.67240f,  1.58674f,  -0.92951f, -0.23449f,
-  0.11923f,  -0.19151f, -0.15914f, 0.03146f,  -0.16541f, 0.17181f,  -0.21834f,
-  0.21906f,  0.96708f,  0.36085f,  -0.42380f, -2.25681f, -0.48812f, 0.72875f,
-  0.06585f,  0.18818f,  -0.02109f, -0.10996f, 0.00187f,  -0.02078f, 0.04484f,
-  -0.07171f, 0.94773f,  -0.33466f, 0.28484f,  0.14791f,  0.30274f,  0.13377f,
-  0.40970f,  0.45133f,  1.69265f,  -0.36422f, -0.15889f, 0.07670f,  0.44675f,
-  -0.28665f, -0.07097f, 1.03803f,  -0.83274f, -0.24571f, 0.08039f,  -0.23790f,
-  -0.23276f, -0.28031f, 0.26451f,  -0.18513f, -2.23336f, -0.62073f, 0.32495f,
-  -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f,  0.19723f,
-  0.04017f,  0.31624f,  0.58369f,  0.30411f,  -0.81165f, -2.58541f, -0.20491f,
-  0.68089f,  -0.14799f, 0.13925f,  0.12867f,  0.15229f,  0.06887f,  -0.03784f,
-  0.02288f,  -0.28712f, 0.14107f,  0.29485f,  -0.11662f, 0.25239f,  0.30311f,
-  -0.07377f, -0.10962f, 0.59856f,  0.47967f,  0.01847f,  -0.27889f, 0.46786f,
-  0.18118f,  0.09355f,  -2.10076f, 0.38823f,  0.28202f,  0.29104f,  0.86977f,
-  0.52377f,  0.21161f,  0.72888f,  -0.00952f, 0.15982f,  -0.14651f, 0.28763f,
-  -0.14155f, 0.00093f,  0.08351f,  0.34685f,  -0.22066f, 0.20378f,  0.25416f,
-  0.03423f,  -0.11068f, -0.41612f, 0.56913f,  -0.06697f, -0.12585f, -0.21033f,
-  -0.14513f, -0.04477f, -0.35778f, 0.03437f,  0.06956f,  -0.25356f, -1.46010f,
-  -0.08142f, 0.11926f,  -0.63551f, -0.13882f, 0.34164f,  0.10821f,  1.07323f,
-  -0.62435f, -0.27116f, 0.25971f,  0.11952f,  -0.39480f, -0.05474f, -0.12582f,
-  0.28289f,  0.13723f,  0.58369f,  0.41865f,  0.28574f,  1.01357f,  0.46661f,
-  0.61717f,  0.85708f,  -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f,
-  -0.01041f, 0.12119f,  -0.20786f, 0.55915f,  0.67511f,  0.55554f,  0.56540f,
-  0.76647f,  0.54766f,  0.45166f,  0.61384f,  0.95407f,  -0.06811f, -0.62132f,
-  0.12713f,  0.63713f,  2.04090f,  1.17054f,  0.00469f,  -0.93692f, -0.24136f,
-  -0.04281f, -0.15787f, 0.37956f,  -0.09174f, -0.72494f, 0.55285f,  -1.40996f,
-  -0.54077f, 0.38445f,  -0.08258f, 0.64259f,  -0.54058f, -0.49865f, 1.41371f,
-  0.89014f,  0.78788f,  0.37919f,  0.87447f,  -0.00760f, -0.00947f, 0.16323f,
-  -0.36632f, -1.38115f, -0.24619f, 0.40490f,  -0.08871f, -0.25365f, -0.60842f,
-  0.11128f,  0.18658f,  -0.86001f, -0.28271f, 0.39572f,  -0.29930f, -0.10110f,
-  0.33706f,  0.21731f,  0.15383f,  -0.01707f, 0.02812f,  0.31192f,  0.39742f,
-  0.38260f,  -0.48263f, 0.57385f,  0.53239f,  -0.60013f, -0.63211f, -0.45140f,
-  -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f,  -0.05195f, -0.07138f,
-  -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f,  0.09419f,
-  0.36919f,  -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f,
-  0.28414f,  0.19273f,  -0.68516f, 0.09514f,  -0.45381f, 0.19917f,  -0.32377f,
-  1.32549f,  0.08244f,  -0.64405f, 0.13195f,  2.85307f,  0.47631f,  -0.33408f,
-  0.04168f,  0.18585f,  -0.18029f, 0.07986f,  -0.08816f, -0.00703f, -0.01515f,
-  -0.13164f, 0.00571f,  0.05676f,  1.51425f,  0.73360f,  0.43486f,  -0.08223f,
-  -0.06183f, -0.57098f, -0.29948f, 0.05945f,  0.19238f,  -0.47980f, -0.35902f,
-  -0.19931f, 0.43443f,  0.67436f,  0.78573f,  0.25703f,  1.01863f,  0.99047f,
-  0.95228f,  1.02429f,  1.19264f,  0.29935f,  -0.26583f, -0.98749f, -0.46167f,
-  -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f,
-  -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f,
-  -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f,  -0.18699f, 0.11037f,
-  1.39110f,  0.05715f,  3.00762f,  1.52243f,  0.25028f,  0.12779f,  -0.12871f,
-  0.04764f,  0.08288f,  -0.16572f, -0.06580f, 0.05845f,  -0.01474f, 0.04886f,
-  -0.10000f, 0.12911f,  -0.01416f, -0.12472f, 0.14358f,  0.16554f,  0.08853f,
-  0.13418f,  -0.05408f, -0.13871f, -0.00049f, 0.20725f,  -0.05603f, 0.27885f,
-  -0.14277f, 0.29653f,  -0.24739f, 0.10101f,  -0.17068f, -2.43802f, 0.41834f,
-  0.49784f,  0.34949f,  0.98487f,  0.16792f,  1.07355f,  0.32546f,  1.32377f,
-  -0.08584f, 0.85214f,  -0.05721f, 0.90307f,  0.20167f,  0.52664f,  -0.14478f,
-  0.64997f,  0.06846f,  0.32475f,  0.64453f,  0.70143f,  -0.03091f, -0.24958f,
-  -0.39021f, -0.57693f, -0.18319f, 0.11793f,  -0.05948f, 0.36670f,  -0.27932f,
-  0.14800f,  -0.55459f, -0.89673f, 0.65922f,  0.54308f,  -0.16731f, -0.59731f,
-  -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f,
-  0.57665f,  0.14171f,  0.54693f,  -0.22144f, -0.59664f, 0.13295f,  0.07057f,
-  -0.19698f, 0.03328f,  -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f,
-  0.10399f,  -0.29824f, 0.16028f,  0.00053f,  0.22699f,  0.04203f,  -0.43880f,
-  -0.12654f, 0.12172f,  0.21087f,  -0.46350f, -0.22081f, -0.06173f, -0.23287f,
-  0.90314f,  0.04466f,  -0.06149f, 0.32682f,  0.16609f,  -0.58991f, -0.03786f,
-  -0.41329f, 0.02632f,  0.23411f,  0.25344f,  0.16468f,  0.31007f,  0.21845f,
-  0.32462f,  0.33945f,  0.11527f,  -0.35926f, -0.18584f, 0.29340f,  0.78199f,
-  2.39287f,  0.53838f,  -1.55085f, 0.02238f,  -0.26153f, -0.42498f, -0.02460f,
-  0.19261f,  -0.10870f, -0.08453f, -0.39561f, 0.08600f,  0.36310f,  0.58439f,
-  -0.59526f, 0.13104f,  -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f,
-  -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f,
-  -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f,
-  -0.01752f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = {
-  1.02458f,  -1.02185f, -0.18978f, 0.05981f,  -0.94931f, 0.34544f,  0.04415f,
-  -0.60036f, -0.11368f, -0.14154f, 1.23438f,  0.51640f,  -0.57587f, -0.91380f,
-  0.95720f,  0.68298f,  -0.06353f, -2.14960f, -0.11080f, 0.79380f,  -0.94199f,
-  0.43040f,  0.01358f,  0.07201f,  -0.49689f, -0.14839f, -0.80132f, -0.13925f,
-  -0.11834f, -0.24998f, -0.33976f, 0.35497f,
-};
-
-static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = {
-  0.87367f,  -1.06469f, -0.50829f, -0.70540f, 1.14596f,  -1.12346f, -0.94467f,
-  0.01380f,  -0.18911f, 0.07961f,  -0.18626f, 0.61902f,  -0.64423f, 1.21545f,
-  1.01149f,  0.26309f,  1.50380f,  1.93940f,  -0.64064f, 1.03987f,  -1.88000f,
-  -0.44574f, -1.53303f, 1.36307f,  1.00292f,  0.37031f,  0.21594f,  0.16758f,
-  0.02592f,  -0.77431f, -0.31797f, -1.53826f, 1.14013f,  -1.21957f, 0.04571f,
-  -0.22168f, 0.32299f,  0.25949f,  -0.13306f, 0.17850f,  0.92494f,  0.19999f,
-  0.07494f,  -0.03362f, -0.53453f, 1.02970f,  -0.22947f, 0.73964f,  1.08445f,
-  0.16855f,  -0.02686f, 0.25254f,  0.05952f,  0.02194f,  0.05649f,  0.39195f,
-  0.14139f,  0.53843f,  -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f,
-  -1.21977f, 0.62932f,  1.07173f,  0.24049f,  -0.51574f, 0.97492f,  -0.28169f,
-  -0.15406f, -0.05441f, -0.25415f, 0.16583f,  0.43674f,  -0.00593f, -0.09277f,
-  0.61402f,  1.35562f,  -0.03926f, 0.18967f,  -0.29548f, -0.55509f, 0.23661f,
-  0.05023f,  0.36226f,  -0.83314f, 0.39357f,  0.19943f,  -0.63431f, -0.03847f,
-  0.12213f,  0.62024f,  -0.11704f, -0.22483f, 0.96624f,  0.18518f,  0.09181f,
-  -0.63068f, 0.66797f,  0.74107f,  0.40624f,  0.70636f,  -0.06921f, 0.34175f,
-  -0.15513f, 2.07844f,  0.22126f,  0.52919f,  0.26793f,  -0.50018f, 1.10549f,
-  0.10970f,  0.05831f,  0.82842f,  -1.22975f, 1.78377f,  0.92679f,  2.01480f,
-  -1.19011f, -0.53381f, 0.38533f,  0.45579f,  -0.10683f, -0.40828f, 0.31398f,
-  0.14978f,  0.91325f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = {
-  1.03659f,
-  1.80249f,
-  1.25710f,
-  1.32000f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = {
-  16,  // num_inputs
-  4,   // num_outputs
-  1,   // num_hidden_layers
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+  1.45347f,  -0.15743f, 0.44236f,  0.25808f,  0.33944f,  0.38678f,  0.24428f,
+  1.67287f,  0.09539f,  -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+  -0.49183f, 0.09333f,  -0.99026f, -0.22157f, 0.53701f,  0.60447f,  0.15686f,
+  -0.04646f, 0.26341f,  2.12361f,  0.27090f,  -1.14716f, -0.64146f, -0.91604f,
+  -0.75335f, -0.60056f, -1.25084f, 1.68473f,  -3.24075f, -4.03867f, -2.07877f,
+  -0.02347f, 0.00333f,  -0.01259f, -0.00465f, 0.02526f,  0.36286f,  -0.10324f,
+  2.12780f,  -0.74584f, -1.05052f, 1.78467f,  -0.55065f, -0.03326f, 2.46781f,
+  1.18349f,  0.96015f,  1.01696f,  1.10584f,  1.07263f,  1.11531f,  -1.06413f,
+  0.32389f,  -1.87360f, -0.14435f, 1.77926f,  1.09966f,  -0.12680f, -0.61386f,
+  -0.09724f, -0.33095f, 1.12122f,  1.00791f,  1.52416f,  1.35004f,  1.32657f,
+  0.60950f,  -1.13538f, -0.38654f, 0.06473f,  2.10669f,  0.27734f,  -0.38359f,
+  -1.91455f, -1.22676f, 0.05786f,  0.97432f,  2.19967f,  0.50457f,  0.78976f,
+  0.95183f,  -0.32414f, 0.49437f,  -0.04506f, 0.18993f,  -0.07971f, 0.23889f,
+  -0.09872f, -0.66036f, 0.05377f,  2.69638f,  -0.08259f, -0.69210f, -1.08296f,
+  -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+  -0.30732f, -0.12043f, 0.11126f,  0.10771f,  -0.14956f, -0.02218f, 0.41016f,
+  1.16599f,  1.14629f,  1.12881f,  1.18676f,  1.24677f,  1.28695f,  1.11270f,
+  0.08233f,  1.75440f,  0.49228f,  -0.34858f, -0.17032f, 0.29288f,  0.47175f,
+  0.19055f,  -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+  -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f,  -0.21451f,
+  2.75281f,  0.04318f, 2.03965f,  0.14618f,  -0.70483f, -0.24517f,
+  1.14048f,  0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+  -1.17079f, 0.19096f,  -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+  0.05972f,  1.44759f,  -0.04068f, -0.26331f, 0.31400f,  0.96923f,  0.33443f,
+  -0.77215f, -0.91316f, -1.78928f, 0.21483f,  -1.24008f, -0.46190f, -0.12127f,
+  -0.62144f, 1.37593f,  0.08373f,  1.56215f,  0.00279f,  -0.14556f, 0.38710f,
+  0.96228f,  0.66433f,  -0.51798f, -0.80738f, -0.18539f, 0.19377f,  -1.03090f,
+  -1.51044f, -0.59485f, -0.62589f, 1.90742f,  0.09078f,  1.49113f,  0.00205f,
+  -0.15918f, 0.40827f,  1.08553f,  0.43431f,  0.33519f,  -1.12669f, -1.10274f,
+  0.80004f,  -1.83599f, -0.53134f, 2.00515f,  -0.32670f, 1.37124f,  0.51136f,
+  1.62563f,  0.24787f,  0.31757f,  0.81751f,  1.57262f,  0.83214f,  1.04661f,
+  -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+  2.32575f,
+  2.75703f,
+  1.12304f,
+  2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
   {
-      32,
+      16,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_16x32_ver_layer0,
-      av1_tx_type_nn_weights_16x32_ver_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_16x32_ver_layer0,
-      av1_tx_type_nn_bias_16x32_ver_layer1,
-  },
+  { av1_tx_type_nn_weights_16x4_hor_layer0,
+    av1_tx_type_nn_weights_16x4_hor_layer1 },
+  { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
 };
-/******************************************************************************/
 
-// Tx type model for 32x16 block.
-static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = {
-  -0.07289f, 0.30798f,  0.41881f,  0.33434f,  -0.01599f, 0.85307f,  -0.16060f,
-  -0.07922f, -0.04693f, 0.29186f,  0.44117f,  1.02417f,  0.12447f,  0.46321f,
-  0.40060f,  0.50140f,  0.48338f,  0.47298f,  0.36585f,  0.42821f,  0.41289f,
-  0.47534f,  0.42900f,  0.26061f,  0.45887f,  0.38163f,  0.17302f,  1.00888f,
-  1.79910f,  1.36140f,  0.24471f,  0.04557f,  1.10823f,  0.74325f,  0.91210f,
-  0.81387f,  0.98865f,  -0.09874f, 0.55146f,  0.19385f,  -0.50752f, -0.17249f,
-  0.27261f,  -0.02763f, -0.03286f, 0.09122f,  0.07015f,  0.20012f,  0.68983f,
-  -1.25345f, -0.00145f, 0.71567f,  0.54948f,  -0.56154f, -0.28918f, 0.11997f,
-  -0.09907f, 0.09195f,  0.05768f,  0.15558f,  0.11284f,  -0.35195f, -0.08723f,
-  -0.03571f, 0.94031f,  0.63737f,  0.98202f,  0.93826f,  0.87126f,  0.88530f,
-  0.97697f,  0.55283f,  0.58670f,  0.86502f,  0.97008f,  0.99709f,  0.66214f,
-  0.96660f,  0.99890f,  0.31945f,  -1.00301f, 0.13215f,  -0.03950f, 0.21148f,
-  0.05128f,  0.10955f,  0.44839f,  -0.33438f, -2.09773f, 0.13908f,  0.58669f,
-  0.25268f,  -0.24006f, 0.01286f,  -0.05732f, 0.03401f,  -0.06896f, 0.35397f,
-  0.05133f,  -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f,
-  -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f,  -0.26912f,
-  0.06866f,  -0.25694f, 0.17632f,  0.32032f,  -0.10666f, 0.26278f,  0.31877f,
-  -0.09338f, -0.14289f, 0.54232f,  0.46070f,  0.00059f,  -0.27914f, 0.45177f,
-  0.16274f,  -0.08811f, -0.45791f, 0.53946f,  -0.16794f, 0.16229f,  0.11840f,
-  -0.24435f, 0.26894f,  -0.33180f, -0.47314f, 0.34061f,  -0.13939f, 0.13321f,
-  -0.05208f, -0.18139f, -0.35234f, 1.37298f,  -0.19360f, 0.21728f,  0.26088f,
-  0.04045f,  -0.10763f, -0.40470f, 0.50026f,  -0.06726f, -0.12871f, -0.20963f,
-  -0.14583f, -0.04711f, -0.35988f, 0.03091f,  0.06491f,  -0.31668f, -0.52190f,
-  0.23397f,  -0.13984f, -0.15207f, -0.49977f, 0.51205f,  0.12559f,  -0.03631f,
-  0.33447f,  -0.36684f, 0.17533f,  0.15671f,  -0.00096f, 0.06817f,  0.20922f,
-  0.34006f,  0.71260f,  0.45024f,  0.53033f,  0.15645f,  0.76019f,  0.56870f,
-  0.83066f,  0.63022f,  1.74436f,  -0.24798f, 0.06795f,  -0.00749f, 0.17795f,
-  0.10371f,  0.06527f,  0.41054f,  0.49003f,  0.34630f,  0.02615f,  0.30320f,
-  -0.47133f, -0.49584f, 0.21775f,  0.27530f,  -0.29977f, -0.64269f, 0.52627f,
-  -0.02492f, 0.08077f,  0.40786f,  -0.36015f, -0.70714f, -1.98185f, -0.28187f,
-  0.35018f,  -0.06105f, -0.12710f, 0.06606f,  -0.27805f, 0.44630f,  -0.84731f,
-  -0.26699f, 0.25856f,  0.06194f,  -0.18674f, -0.11560f, -0.43277f, 1.10579f,
-  0.95876f,  0.17415f,  0.56386f,  0.68426f,  0.50180f,  0.24844f,  0.12347f,
-  0.15281f,  -0.19089f, 0.52279f,  0.41860f,  -0.05270f, -0.17029f, -0.03542f,
-  0.10621f,  -0.25088f, 0.24070f,  -0.08951f, 0.29950f,  -0.36720f, 0.02151f,
-  0.20129f,  -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f,
-  0.23814f,  -0.25991f, 0.05812f,  0.60554f,  -0.06106f, -0.58326f, 0.28762f,
-  -0.18747f, 0.08232f,  -0.04243f, -0.03293f, 0.14722f,  -0.13017f, -0.67263f,
-  0.38698f,  -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f,  0.04684f,
-  0.04214f,  0.00030f,  0.02410f,  0.19966f,  -0.04246f, 0.00442f,  0.23121f,
-  0.13364f,  0.21548f,  -0.12748f, -0.14066f, -0.28354f, 0.59937f,  -0.27553f,
-  1.57503f,  -0.01050f, -0.17724f, 0.44110f,  -0.80334f, 0.72064f,  1.00501f,
-  -0.72638f, 0.02774f,  0.48540f,  -0.72016f, -0.27721f, 0.31559f,  0.07322f,
-  0.20279f,  -0.19647f, 0.02352f,  0.12662f,  0.19743f,  0.30543f,  0.25712f,
-  0.44702f,  0.16417f,  0.17888f,  -2.58469f, 0.20555f,  0.57782f,  -0.10892f,
-  0.14527f,  0.82251f,  0.04200f,  0.44626f,  0.10818f,  0.71204f,  0.62903f,
-  0.69178f,  0.73603f,  0.52717f,  0.83020f,  0.48824f,  1.03270f,  -0.00152f,
-  0.07958f,  0.24181f,  -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f,
-  0.56318f,  0.32580f,  -0.58503f, -0.33673f, -0.00838f, 0.48924f,  0.43362f,
-  0.12750f,  0.00295f,  0.38624f,  0.17037f,  0.00729f,  -0.26256f, -0.41669f,
-  0.36847f,  0.22424f,  1.33334f,  0.18112f,  0.37682f,  0.49173f,  -0.45240f,
-  -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f,  0.38033f,  0.13685f,
-  0.17597f,  0.28668f,  0.31193f,  -0.43281f, 0.43267f,  -0.50495f, 0.01969f,
-  0.14131f,  -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f,
-  -0.38584f, -0.10953f, 0.19669f,  0.34540f,  -0.49941f, 0.04605f,  -0.43535f,
-  0.27519f,  0.03659f,  -0.31961f, 0.13330f,  0.87009f,  0.20101f,  -0.70392f,
-  -0.27883f, 0.33874f,  -0.34308f, 0.67760f,  0.88195f,  0.55752f,  -0.26563f,
-  0.17875f,  0.06964f,  0.87607f,  1.47616f,  0.46747f,  -0.56408f, -0.39352f,
-  -0.16427f, -0.41185f, 0.14187f,  0.19265f,  -0.58613f, 0.56345f,  -0.17729f,
-  -0.11320f, 0.08752f,  -0.01329f, 1.20981f,  0.45170f,  -0.20571f, -0.01150f,
-  0.26476f,  0.13508f,  0.22020f,  -0.42684f, -0.22499f, -1.51212f, 0.86648f,
-  0.21776f,  0.24666f,  0.71339f,  0.42742f,  -0.00952f, 0.14762f,  0.07693f,
-  -0.19599f, 0.03075f,  -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f,
-  0.10038f,  -0.30038f, 0.14686f,  0.00548f,  0.20350f,  0.00763f,  -0.43756f,
-  -0.01997f, 0.00902f,  0.07470f,  -0.41441f, -0.20605f, 0.07626f,  -0.34973f,
-  0.47455f,  -0.15251f, -0.05325f, 0.04964f,  0.32477f,  -0.54604f, 0.25273f,
-  -0.18461f, -0.30841f, 0.64908f,  0.60752f,  0.64148f,  0.72788f,  0.71232f,
-  0.58597f,  0.73017f,  0.58857f,  0.71908f,  0.59860f,  0.61849f,  0.99398f,
-  0.39572f,  -0.36165f, -1.88646f, 0.14384f,  -0.60541f, -0.21380f, -0.55498f,
-  -0.50960f, -0.08801f, 0.51892f,  0.19126f,  0.57879f,  1.19447f,  0.25673f,
-  -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f,  -0.60983f,
-  -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f,
-  -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f,
-  -0.14533f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = {
-  1.53781f,  -0.49320f, -0.31646f, 0.02826f,  -1.05554f, 0.06559f,  -0.12399f,
-  -0.61671f, -0.28956f, -0.15419f, 0.87189f,  -0.43375f, -1.08477f, -0.66006f,
-  0.36233f,  0.82678f,  -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f,
-  0.50173f,  -0.07560f, 0.71598f,  1.50795f,  -0.04745f, -0.14008f, -0.18510f,
-  -0.14988f, -0.67044f, 0.79659f,  0.70610f,
-};
-
-static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = {
-  0.84983f,  -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f,
-  -0.02850f, 0.50767f,  0.10252f,  0.24540f,  0.67748f,  -0.43483f, -0.22242f,
-  0.23431f,  0.57287f,  0.69560f,  1.13814f,  -0.47427f, -0.55858f, -1.47072f,
-  0.26587f,  -0.36335f, 0.83060f,  1.01645f,  -0.52895f, -0.11614f, 0.17390f,
-  -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f,  -0.55612f, 0.46852f,
-  0.07406f,  -0.80467f, 0.23059f,  0.09992f,  -0.06164f, 0.13541f,  0.06135f,
-  0.83605f,  -0.53224f, -0.13867f, 0.93838f,  -0.61290f, 0.27732f,  -0.46688f,
-  -0.41810f, 0.12885f,  0.13619f,  -0.24612f, 0.07215f,  0.98866f,  0.10993f,
-  1.05799f,  -0.27146f, -0.00079f, -0.08585f, 0.08322f,  -0.33809f, 0.67598f,
-  -1.06515f, 1.28866f,  0.61028f,  -0.31704f, -0.59905f, 1.62151f,  0.10969f,
-  0.20671f,  -0.17818f, 0.14170f,  0.19322f,  0.30602f,  0.93111f,  0.19011f,
-  -0.45609f, 0.82506f,  0.32936f,  -0.07858f, -0.27106f, -0.31638f, 0.23299f,
-  0.81491f,  0.32584f,  -0.52093f, -0.32472f, 0.53643f,  -0.42605f, 0.01641f,
-  0.09002f,  0.15832f,  -0.08790f, 0.05511f,  1.00730f,  0.46309f,  0.68166f,
-  -0.18835f, 0.64512f,  -1.00540f, 0.86802f,  0.18981f,  -0.06982f, -0.24514f,
-  -0.08027f, 0.61199f,  -0.20830f, 0.72001f,  0.17477f,  0.06511f,  0.00801f,
-  -0.43590f, 0.37257f,  0.70323f,  0.60233f,  1.62541f,  0.74383f,  -0.22254f,
-  -0.33892f, 0.22881f,  0.62817f,  0.68915f,  -0.06417f, 0.00969f,  1.65869f,
-  0.89060f,  0.75948f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = {
-  0.95359f,
-  1.56043f,
-  1.06017f,
-  2.54520f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = {
-  16,  // num_inputs
-  4,   // num_outputs
-  1,   // num_hidden_layers
-  {
-      32,
-  },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_32x16_hor_layer0,
-      av1_tx_type_nn_weights_32x16_hor_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_32x16_hor_layer0,
-      av1_tx_type_nn_bias_32x16_hor_layer1,
-  },
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+  0.26047f,  0.99930f,  1.16484f,  -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+  0.46375f,  1.47951f,  1.13735f,  1.12356f,  0.27385f,  0.50978f,  2.09967f,
+  -1.47386f, 0.01950f,  -0.06362f, 0.26014f,  1.04544f,  -0.03099f, 0.07478f,
+  -0.39701f, 0.05545f,  2.73633f,  -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+  -0.17967f, -0.96622f, 0.42635f,  -1.04784f,
 };
 
-static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = {
-  1.30219f,  1.30548f,  1.33334f,  1.20560f,  1.01572f,  1.38100f,  1.37504f,
-  0.12599f,  -0.96957f, 0.19400f,  0.75734f,  0.11295f,  -0.40447f, -1.53062f,
-  -0.82980f, 0.02168f,  -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f,
-  -0.87176f, -1.10711f, 0.71207f,  1.49689f,  -0.12715f, 0.29357f,  0.35234f,
-  0.61016f,  0.80708f,  0.83564f,  1.05961f,  -0.99842f, 0.82004f,  0.02638f,
-  0.44606f,  0.32298f,  0.21321f,  0.47290f,  -0.71442f, -2.81050f, -0.02520f,
-  -0.08919f, 0.00369f,  -0.05257f, -0.07011f, -0.16394f, 0.06290f,  0.80086f,
-  0.32349f,  0.47411f,  1.36126f,  1.68162f,  0.91325f,  -0.27495f, 0.00262f,
-  0.06025f,  0.42832f,  0.36965f,  0.38063f,  0.32772f,  0.40914f,  0.44510f,
-  3.02239f,  -1.84077f, 0.49536f,  -0.27340f, -0.10437f, -0.34293f, -0.08047f,
-  -0.29651f, -0.97111f, -0.34187f, 0.52869f,  1.27240f,  1.20306f,  1.19121f,
-  1.28742f,  0.26393f,  -0.62319f, 0.92285f,  -0.08303f, -0.33118f, -0.13053f,
-  0.24875f,  -0.52089f, 0.44691f,  -1.08908f, 1.20921f,  0.36538f,  -0.46792f,
-  -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f,  0.68519f,  0.08228f,
-  -0.49027f, -0.34381f, 0.04719f,  -0.33298f, 0.72525f,  0.09538f,  -0.29216f,
-  -0.07260f, -0.55827f, 0.54542f,  -0.10144f, -0.09292f, -0.14427f, -0.38361f,
-  -0.41559f, 0.75338f,  -0.04530f, 0.27944f,  0.06932f,  -0.11537f, 0.29568f,
-  1.92155f,  -0.98996f, -0.08841f, 0.49386f,  0.15947f,  0.53290f,  1.46747f,
-  0.59360f,  0.25468f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = {
-  -1.19673f, 0.33043f,  0.24408f, 0.46221f,  2.00646f, 0.19031f,
-  -0.64944f, -0.43452f, 1.04400f, 1.47371f,  0.52460f, -1.39577f,
-  0.83852f,  -0.25536f, 1.33200f, -0.24444f,
-};
-
-static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = {
-  -1.31447f, -0.86455f, 0.85217f,  1.00048f,  0.37395f, -1.35713f, -0.54032f,
-  0.82803f,  0.89606f,  1.57696f,  0.68067f,  0.42512f, -0.26250f, 0.14621f,
-  0.93249f,  -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f,
-  0.67700f,  -0.29310f, 0.91604f,  -1.21881f, 1.11188f, 0.45045f,  -0.86119f,
-  -0.09294f, 0.09360f,  0.80794f,  0.41027f,  1.80399f, -0.50362f, -1.44689f,
-  0.85148f,  0.90707f,  -0.18458f, 0.14165f,  1.17367f, 0.70869f,  1.57147f,
-  0.24692f,  0.16626f,  0.56794f,  0.07313f,  0.14728f, -0.74296f, 1.74127f,
-  1.26560f,  0.17753f,  1.10194f,  0.56435f,  1.73779f, 1.42841f,  -1.16773f,
-  0.24584f,  0.10813f,  -0.60187f, 0.79802f,  0.75229f, -0.06112f, 1.77282f,
-  1.01058f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = {
-  0.83082f,
-  2.03845f,
-  0.59627f,
-  2.31341f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = {
-  8,  // num_inputs
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+  -0.52088f, 0.52844f,  -1.03655f, -0.30974f,
+  2.59952f,  -1.93604f, 0.00000f,  2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+  0.10916f,  -0.21219f, -0.51340f, 0.69161f,  1.45988f,  -1.36942f, -0.40899f,
+  1.05136f,  -0.08486f, 0.10008f,  -0.55304f, 0.88012f,  1.61177f,  -1.64507f,
+  0.63428f,  1.15130f,  -0.17287f, -0.18592f, -0.01143f, 0.88293f,  1.73326f,
+  -1.63624f, 0.09359f,  1.18393f,  0.26531f,  0.22378f,  0.15170f,  1.06965f,
+  1.26814f,  -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+  2.34713f,
+  1.68667f,
+  1.25488f,
+  1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+  4,  // num_inputs
   4,  // num_outputs
   1,  // num_hidden_layers
   {
-      16,
+      8,
   },  // num_hidden_nodes
-  {
-      av1_tx_type_nn_weights_32x16_ver_layer0,
-      av1_tx_type_nn_weights_32x16_ver_layer1,
-  },
-  {
-      av1_tx_type_nn_bias_32x16_ver_layer0,
-      av1_tx_type_nn_bias_32x16_ver_layer1,
-  },
+  { av1_tx_type_nn_weights_16x4_ver_layer0,
+    av1_tx_type_nn_weights_16x4_ver_layer1 },
+  { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
 };
 /******************************************************************************/
 
 // Map tx_size to its corresponding neural net model for tx type prediction.
 static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = {
-  &av1_tx_type_nnconfig_4x4,        // 4x4
-  &av1_tx_type_nnconfig_8x8,        // 8x8
-  &av1_tx_type_nnconfig_16x16,      // 16x16
-  NULL,                             // 32x32
-  NULL,                             // 64x64
-  &av1_tx_type_nnconfig_4x8_hor,    // 4x8
-  &av1_tx_type_nnconfig_8x4_hor,    // 8x4
-  &av1_tx_type_nnconfig_8x16_hor,   // 8x16
-  &av1_tx_type_nnconfig_16x8_hor,   // 16x8
-  &av1_tx_type_nnconfig_16x32_hor,  // 16x32
-  &av1_tx_type_nnconfig_32x16_hor,  // 32x16
-  NULL,                             // 32x64
-  NULL,                             // 64x32
-  NULL,                             // 4x16
-  NULL,                             // 16x4
-  NULL,                             // 8x32
-  NULL,                             // 32x8
-  NULL,                             // 16x64
-  NULL,                             // 64x16
+  &av1_tx_type_nnconfig_4x4_hor,   // 4x4 transform
+  &av1_tx_type_nnconfig_8x8_hor,   // 8x8 transform
+  &av1_tx_type_nnconfig_16x16,     // 16x16 transform
+  NULL,                            // 32x32 transform
+  NULL,                            // 64x64 transform
+  &av1_tx_type_nnconfig_4x8_hor,   // 4x8 transform
+  &av1_tx_type_nnconfig_8x4_hor,   // 8x4 transform
+  &av1_tx_type_nnconfig_8x16_hor,  // 8x16 transform
+  &av1_tx_type_nnconfig_16x8_hor,  // 16x8 transform
+  NULL,                            // 16x32 transform
+  NULL,                            // 32x16 transform
+  NULL,                            // 32x64 transform
+  NULL,                            // 64x32 transform
+  &av1_tx_type_nnconfig_4x16_hor,  // 4x16 transform
+  &av1_tx_type_nnconfig_16x4_hor,  // 16x4 transform
+  NULL,                            // 8x32 transform
+  NULL,                            // 32x8 transform
+  NULL,                            // 16x64 transform
+  NULL,                            // 64x16 transform
 };
 
 static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = {
-  &av1_tx_type_nnconfig_4x4,        // 4x4 transform
-  &av1_tx_type_nnconfig_8x8,        // 8x8 transform
-  &av1_tx_type_nnconfig_16x16,      // 16x16 transform
-  NULL,                             // 32x32 transform
-  NULL,                             // 64x64 transform
-  &av1_tx_type_nnconfig_4x8_ver,    // 4x8 transform
-  &av1_tx_type_nnconfig_8x4_ver,    // 8x4 transform
-  &av1_tx_type_nnconfig_8x16_ver,   // 8x16 transform
-  &av1_tx_type_nnconfig_16x8_ver,   // 16x8 transform
-  &av1_tx_type_nnconfig_16x32_ver,  // 16x32 transform
-  &av1_tx_type_nnconfig_32x16_ver,  // 32x16 transform
-  NULL,                             // 32x64 transform
-  NULL,                             // 64x32 transform
-  NULL,                             // 4x16 transform
-  NULL,                             // 16x4 transform
-  NULL,                             // 8x32 transform
-  NULL,                             // 32x8 transform
-  NULL,                             // 16x64 transform
-  NULL,                             // 64x16 transform
+  &av1_tx_type_nnconfig_4x4_ver,   // 4x4 transform
+  &av1_tx_type_nnconfig_8x8_ver,   // 8x8 transform
+  &av1_tx_type_nnconfig_16x16,     // 16x16 transform
+  NULL,                            // 32x32 transform
+  NULL,                            // 64x64 transform
+  &av1_tx_type_nnconfig_4x8_ver,   // 4x8 transform
+  &av1_tx_type_nnconfig_8x4_ver,   // 8x4 transform
+  &av1_tx_type_nnconfig_8x16_ver,  // 8x16 transform
+  &av1_tx_type_nnconfig_16x8_ver,  // 16x8 transform
+  NULL,                            // 16x32 transform
+  NULL,                            // 32x16 transform
+  NULL,                            // 32x64 transform
+  NULL,                            // 64x32 transform
+  &av1_tx_type_nnconfig_4x16_ver,  // 4x16 transform
+  &av1_tx_type_nnconfig_16x4_ver,  // 16x4 transform
+  NULL,                            // 8x32 transform
+  NULL,                            // 32x8 transform
+  NULL,                            // 16x64 transform
+  NULL,                            // 64x16 transform
 };
 
 // Tx split model for 4x8 block.
@@ -2083,4 +1941,4 @@ static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#endif  // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index c71f2e74c..07615543c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
 }
 
 void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
+                           int8_t cos_bit, const int instride,
+                           const int outstride) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
 
@@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
 
   // stage 1
   __m128i x1[64];
-  x1[0] = _mm_add_epi32(input[0], input[63]);
-  x1[63] = _mm_sub_epi32(input[0], input[63]);
-  x1[1] = _mm_add_epi32(input[1], input[62]);
-  x1[62] = _mm_sub_epi32(input[1], input[62]);
-  x1[2] = _mm_add_epi32(input[2], input[61]);
-  x1[61] = _mm_sub_epi32(input[2], input[61]);
-  x1[3] = _mm_add_epi32(input[3], input[60]);
-  x1[60] = _mm_sub_epi32(input[3], input[60]);
-  x1[4] = _mm_add_epi32(input[4], input[59]);
-  x1[59] = _mm_sub_epi32(input[4], input[59]);
-  x1[5] = _mm_add_epi32(input[5], input[58]);
-  x1[58] = _mm_sub_epi32(input[5], input[58]);
-  x1[6] = _mm_add_epi32(input[6], input[57]);
-  x1[57] = _mm_sub_epi32(input[6], input[57]);
-  x1[7] = _mm_add_epi32(input[7], input[56]);
-  x1[56] = _mm_sub_epi32(input[7], input[56]);
-  x1[8] = _mm_add_epi32(input[8], input[55]);
-  x1[55] = _mm_sub_epi32(input[8], input[55]);
-  x1[9] = _mm_add_epi32(input[9], input[54]);
-  x1[54] = _mm_sub_epi32(input[9], input[54]);
-  x1[10] = _mm_add_epi32(input[10], input[53]);
-  x1[53] = _mm_sub_epi32(input[10], input[53]);
-  x1[11] = _mm_add_epi32(input[11], input[52]);
-  x1[52] = _mm_sub_epi32(input[11], input[52]);
-  x1[12] = _mm_add_epi32(input[12], input[51]);
-  x1[51] = _mm_sub_epi32(input[12], input[51]);
-  x1[13] = _mm_add_epi32(input[13], input[50]);
-  x1[50] = _mm_sub_epi32(input[13], input[50]);
-  x1[14] = _mm_add_epi32(input[14], input[49]);
-  x1[49] = _mm_sub_epi32(input[14], input[49]);
-  x1[15] = _mm_add_epi32(input[15], input[48]);
-  x1[48] = _mm_sub_epi32(input[15], input[48]);
-  x1[16] = _mm_add_epi32(input[16], input[47]);
-  x1[47] = _mm_sub_epi32(input[16], input[47]);
-  x1[17] = _mm_add_epi32(input[17], input[46]);
-  x1[46] = _mm_sub_epi32(input[17], input[46]);
-  x1[18] = _mm_add_epi32(input[18], input[45]);
-  x1[45] = _mm_sub_epi32(input[18], input[45]);
-  x1[19] = _mm_add_epi32(input[19], input[44]);
-  x1[44] = _mm_sub_epi32(input[19], input[44]);
-  x1[20] = _mm_add_epi32(input[20], input[43]);
-  x1[43] = _mm_sub_epi32(input[20], input[43]);
-  x1[21] = _mm_add_epi32(input[21], input[42]);
-  x1[42] = _mm_sub_epi32(input[21], input[42]);
-  x1[22] = _mm_add_epi32(input[22], input[41]);
-  x1[41] = _mm_sub_epi32(input[22], input[41]);
-  x1[23] = _mm_add_epi32(input[23], input[40]);
-  x1[40] = _mm_sub_epi32(input[23], input[40]);
-  x1[24] = _mm_add_epi32(input[24], input[39]);
-  x1[39] = _mm_sub_epi32(input[24], input[39]);
-  x1[25] = _mm_add_epi32(input[25], input[38]);
-  x1[38] = _mm_sub_epi32(input[25], input[38]);
-  x1[26] = _mm_add_epi32(input[26], input[37]);
-  x1[37] = _mm_sub_epi32(input[26], input[37]);
-  x1[27] = _mm_add_epi32(input[27], input[36]);
-  x1[36] = _mm_sub_epi32(input[27], input[36]);
-  x1[28] = _mm_add_epi32(input[28], input[35]);
-  x1[35] = _mm_sub_epi32(input[28], input[35]);
-  x1[29] = _mm_add_epi32(input[29], input[34]);
-  x1[34] = _mm_sub_epi32(input[29], input[34]);
-  x1[30] = _mm_add_epi32(input[30], input[33]);
-  x1[33] = _mm_sub_epi32(input[30], input[33]);
-  x1[31] = _mm_add_epi32(input[31], input[32]);
-  x1[32] = _mm_sub_epi32(input[31], input[32]);
+  x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+  x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+  x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+  x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+  x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+  x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+  x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+  x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+  x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+  x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+  x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+  x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+  x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+  x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+  x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+  x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+  x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+  x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+  x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+  x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+  x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+  x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+  x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+  x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+  x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+  x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+  x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+  x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+  x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+  x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+  x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+  x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+  x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+  x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+  x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+  x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+  x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+  x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+  x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+  x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+  x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+  x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+  x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+  x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+  x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+  x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+  x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+  x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+  x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+  x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+  x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+  x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+  x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+  x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+  x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+  x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+  x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+  x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+  x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+  x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+  x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+  x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+  x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+  x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
 
   // stage 2
   __m128i x2[64];
@@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
                           x10[48], __rounding, cos_bit);
 
   // stage 11
-  output[0] = x10[0];
-  output[1] = x10[32];
-  output[2] = x10[16];
-  output[3] = x10[48];
-  output[4] = x10[8];
-  output[5] = x10[40];
-  output[6] = x10[24];
-  output[7] = x10[56];
-  output[8] = x10[4];
-  output[9] = x10[36];
-  output[10] = x10[20];
-  output[11] = x10[52];
-  output[12] = x10[12];
-  output[13] = x10[44];
-  output[14] = x10[28];
-  output[15] = x10[60];
-  output[16] = x10[2];
-  output[17] = x10[34];
-  output[18] = x10[18];
-  output[19] = x10[50];
-  output[20] = x10[10];
-  output[21] = x10[42];
-  output[22] = x10[26];
-  output[23] = x10[58];
-  output[24] = x10[6];
-  output[25] = x10[38];
-  output[26] = x10[22];
-  output[27] = x10[54];
-  output[28] = x10[14];
-  output[29] = x10[46];
-  output[30] = x10[30];
-  output[31] = x10[62];
-  output[32] = x10[1];
-  output[33] = x10[33];
-  output[34] = x10[17];
-  output[35] = x10[49];
-  output[36] = x10[9];
-  output[37] = x10[41];
-  output[38] = x10[25];
-  output[39] = x10[57];
-  output[40] = x10[5];
-  output[41] = x10[37];
-  output[42] = x10[21];
-  output[43] = x10[53];
-  output[44] = x10[13];
-  output[45] = x10[45];
-  output[46] = x10[29];
-  output[47] = x10[61];
-  output[48] = x10[3];
-  output[49] = x10[35];
-  output[50] = x10[19];
-  output[51] = x10[51];
-  output[52] = x10[11];
-  output[53] = x10[43];
-  output[54] = x10[27];
-  output[55] = x10[59];
-  output[56] = x10[7];
-  output[57] = x10[39];
-  output[58] = x10[23];
-  output[59] = x10[55];
-  output[60] = x10[15];
-  output[61] = x10[47];
-  output[62] = x10[31];
-  output[63] = x10[63];
+  output[0 * outstride] = x10[0];
+  output[1 * outstride] = x10[32];
+  output[2 * outstride] = x10[16];
+  output[3 * outstride] = x10[48];
+  output[4 * outstride] = x10[8];
+  output[5 * outstride] = x10[40];
+  output[6 * outstride] = x10[24];
+  output[7 * outstride] = x10[56];
+  output[8 * outstride] = x10[4];
+  output[9 * outstride] = x10[36];
+  output[10 * outstride] = x10[20];
+  output[11 * outstride] = x10[52];
+  output[12 * outstride] = x10[12];
+  output[13 * outstride] = x10[44];
+  output[14 * outstride] = x10[28];
+  output[15 * outstride] = x10[60];
+  output[16 * outstride] = x10[2];
+  output[17 * outstride] = x10[34];
+  output[18 * outstride] = x10[18];
+  output[19 * outstride] = x10[50];
+  output[20 * outstride] = x10[10];
+  output[21 * outstride] = x10[42];
+  output[22 * outstride] = x10[26];
+  output[23 * outstride] = x10[58];
+  output[24 * outstride] = x10[6];
+  output[25 * outstride] = x10[38];
+  output[26 * outstride] = x10[22];
+  output[27 * outstride] = x10[54];
+  output[28 * outstride] = x10[14];
+  output[29 * outstride] = x10[46];
+  output[30 * outstride] = x10[30];
+  output[31 * outstride] = x10[62];
+  output[32 * outstride] = x10[1];
+  output[33 * outstride] = x10[33];
+  output[34 * outstride] = x10[17];
+  output[35 * outstride] = x10[49];
+  output[36 * outstride] = x10[9];
+  output[37 * outstride] = x10[41];
+  output[38 * outstride] = x10[25];
+  output[39 * outstride] = x10[57];
+  output[40 * outstride] = x10[5];
+  output[41 * outstride] = x10[37];
+  output[42 * outstride] = x10[21];
+  output[43 * outstride] = x10[53];
+  output[44 * outstride] = x10[13];
+  output[45 * outstride] = x10[45];
+  output[46 * outstride] = x10[29];
+  output[47 * outstride] = x10[61];
+  output[48 * outstride] = x10[3];
+  output[49 * outstride] = x10[35];
+  output[50 * outstride] = x10[19];
+  output[51 * outstride] = x10[51];
+  output[52 * outstride] = x10[11];
+  output[53 * outstride] = x10[43];
+  output[54 * outstride] = x10[27];
+  output[55 * outstride] = x10[59];
+  output[56 * outstride] = x10[7];
+  output[57 * outstride] = x10[39];
+  output[58 * outstride] = x10[23];
+  output[59 * outstride] = x10[55];
+  output[60 * outstride] = x10[15];
+  output[61 * outstride] = x10[47];
+  output[62 * outstride] = x10[31];
+  output[63 * outstride] = x10[63];
 }
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index abb95f31e..8ec0256eb 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -14,6 +14,7 @@
 #include "av1/common/enums.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
@@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
   }
 }
 
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+                              const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 64;  // 1-D transform length
+  const int num_per_128 = 4;  // 32-bit lanes in one __m128i
+  int col_num = txfm_size / num_per_128;  // vectors per row; used as stride
+  (void)stage_range;  // unused; kept so this fits the TxfmFuncSSE2 signature
+  for (int col = 0; col < col_num; col++) {  // one strided 1-D DCT per column
+    av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
+                          col_num);
+  }
+}
+
 static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
     case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+    case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
     default: assert(0);
   }
   return NULL;
@@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
   transpose_32(txfm_size, buf_128, out_128);
 }
 
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+                                           int32_t *output, const int stride,
+                                           const TXFM_2D_FLIP_CFG *cfg,
+                                           int32_t *txfm_buf) {
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;  // only shift[1] and shift[2] are read
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  __m128i *buf_128 = (__m128i *)txfm_buf;
+  __m128i *out_128 = (__m128i *)output;
+  // output and txfm_buf alternate as source and destination below.
+  const int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+  int col_num = txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+                                                        txfm_size);
+  /* Column pass: 1-D transform, round shift by -shift[1], then transpose. */
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+  // Row pass: only the first col_num / 2 vector columns are transformed, and
+  // the results are written with a halved row stride (col_num / 2).
+  for (int col = 0; col < (col_num >> 1); col++) {
+    av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+                          col_num, (col_num >> 1));
+  }
+  // Round-shift only (col_num / 2) * (txfm_size / 2) vectors, then transpose.
+  txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+  av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32x32(buf_128, out_128);
+}
+
 void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
                                  int stride, TX_TYPE tx_type, int bd) {
   DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
@@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
   fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
 }
 
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;  // bd is not used by this implementation.
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[64 * 64]);  // Scratch for the passes.
+  TXFM_2D_FLIP_CFG cfg;  // Filled in below for TX_64X64 and tx_type.
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+  fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
 static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
                                       const __m128i *inputB, __m128i *output) {
   __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
@@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
-    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
     av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
     av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
 
@@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
       bufA[j] = _mm_cvtepi16_epi32(buf[j]);
       bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
     }
-    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
-    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
-    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
-    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
 
     int32_t *output8 = output + 8 * 32 * i;
     for (int j = 0; j < width_div8; ++j) {
@@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
     }
     av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
     av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
-    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
-    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
 
     int32_t *output8 = output + 8 * 32 * i;
     for (int j = 0; j < (32 / 4); ++j) {
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
index c582ca0e3..38707137c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_FWD_TXFM_AVX2_H_
-#define AV1_FWD_TXFM_AVX2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
 #include <immintrin.h>
 
 static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
@@ -100,4 +100,4 @@ static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
   *in1 = _mm256_srai_epi32(temp1, cos_bit);
 }
 
-#endif  // AV1_FWD_TXFM_AVX2_H_
+#endif  // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
index aa14d3ade..99a6b9082 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
 
 #include <immintrin.h>
 
@@ -114,4 +114,4 @@ static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
 }
 #endif
 
-#endif  // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#endif  // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
index 0adefecdb..6df2a8bdb 100644
--- a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_TXMF1D_SSE2_H_
-#define AV1_TXMF1D_SSE2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
 
 #include <smmintrin.h>
 #include "av1/common/av1_txfm.h"
@@ -29,7 +29,8 @@ void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
 void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
                            int8_t cos_bit);
 void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           int8_t cos_bit);
+                           int8_t cos_bit, const int instride,
+                           const int outstride);
 
 void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t cos_bit, const int8_t *stage_range);
@@ -138,4 +139,4 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input,
 }
 #endif
 
-#endif  // AV1_TXMF1D_SSE2_H_
+#endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 000000000..7642f57d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+#include <immintrin.h>  /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Packs 32 consecutive coefficients starting at cf down to bytes: signed
+// saturation to 16 bits, absolute value, then signed saturation to 8 bits.
+// The returned bytes are in the same order as the coefficients in memory.
+static INLINE __m256i txb_abs_levels_32_avx2(const tran_low_t *const cf) {
+  const __m256i lo16 =
+      _mm256_packs_epi32(yy_loadu_256(cf), yy_loadu_256(cf + 8));
+  const __m256i hi16 =
+      _mm256_packs_epi32(yy_loadu_256(cf + 16), yy_loadu_256(cf + 24));
+  const __m256i bytes =
+      _mm256_packs_epi16(_mm256_abs_epi16(lo16), _mm256_abs_epi16(hi16));
+  // The packs work per 128-bit lane, so put the groups back in linear order.
+  return _mm256_shuffle_epi32(_mm256_permute4x64_epi64(bytes, 0xd8), 0xd8);
+}
+
+// Fills the padded `levels` buffer from a width x height coefficient block.
+//
+// Rows of `levels` are width + TX_PAD_HOR bytes apart. TX_PAD_TOP rows above
+// `levels` and TX_PAD_BOTTOM rows below the last row are zeroed first; then
+// every row receives the saturated absolute values of its coefficients, with
+// the four bytes after the row cleared.
+//
+// Both zero fills use whole 32-byte stores, so the top fill may run past its
+// end and the bottom fill may start before its start. Those bytes lie in the
+// row area written afterwards (presumably why the overlap is harmless).
+// `coeff` is read with unaligned 256-bit loads and treated as 32-bit values.
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  const __m256i zero = _mm256_setzero_si256();
+  const int32_t top_bytes = sizeof(*levels) * TX_PAD_TOP * stride;
+  const int32_t bottom_bytes = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+
+  // Zero the rows above `levels`, one 32-byte store at a time.
+  uint8_t *top = levels - TX_PAD_TOP * stride;
+  uint8_t *const top_end = top + top_bytes;
+  do {
+    yy_storeu_256(top, zero);
+    top += 32;
+  } while (top < top_end);
+
+  // Zero the rows below the block; the start is pulled back so that the last
+  // 32-byte store ends exactly at the end of the bottom padding.
+  uint8_t *const bottom_end = levels + (height + TX_PAD_BOTTOM) * stride;
+  uint8_t *bottom = bottom_end - ((bottom_bytes + 31) & (~31));
+  do {
+    yy_storeu_256(bottom, zero);
+    bottom += 32;
+  } while (bottom < bottom_end);
+
+  const tran_low_t *src = coeff;
+  uint8_t *dst = levels;
+  int row = 0;
+  if (width == 4) {
+    // Four rows (16 coefficients) per iteration; each output row is the four
+    // level bytes followed by four zero bytes, 32 bytes in total.
+    do {
+      const __m256i c16 =
+          _mm256_packs_epi32(yy_loadu_256(src), yy_loadu_256(src + 8));
+      const __m256i c8 = _mm256_packs_epi16(_mm256_abs_epi16(c16), zero);
+      const __m256i rows =
+          _mm256_permute4x64_epi64(_mm256_shuffle_epi32(c8, 0xd8), 0xd8);
+      yy_storeu_256(dst, rows);
+      dst += 32;
+      src += 16;
+      row += 4;
+    } while (row < height);
+  } else if (width == 8) {
+    // Four rows (32 coefficients) per iteration.
+    do {
+      const __m256i lv = txb_abs_levels_32_avx2(src);
+      const __m128i rows01 = _mm256_castsi256_si128(lv);
+      const __m128i rows23 = _mm256_extracti128_si256(lv, 1);
+      xx_storel_64(dst, rows01);
+      *(int32_t *)(dst + width) = 0;
+      xx_storel_64(dst + stride, _mm_srli_si128(rows01, 8));
+      *(int32_t *)(dst + width + stride) = 0;
+      xx_storel_64(dst + stride * 2, rows23);
+      *(int32_t *)(dst + width + stride * 2) = 0;
+      xx_storel_64(dst + stride * 3, _mm_srli_si128(rows23, 8));
+      *(int32_t *)(dst + width + stride * 3) = 0;
+      src += 32;
+      dst += stride << 2;
+      row += 4;
+    } while (row < height);
+  } else if (width == 16) {
+    // Two rows (32 coefficients) per iteration.
+    do {
+      const __m256i lv = txb_abs_levels_32_avx2(src);
+      xx_storeu_128(dst, _mm256_castsi256_si128(lv));
+      xx_storeu_128(dst + stride, _mm256_extracti128_si256(lv, 1));
+      *(int32_t *)(dst + width) = 0;
+      *(int32_t *)(dst + stride + width) = 0;
+      src += 32;
+      dst += stride << 1;
+      row += 2;
+    } while (row < height);
+  } else {
+    // One row of 32 coefficients per iteration.
+    do {
+      yy_storeu_256(dst, txb_abs_levels_32_avx2(src));
+      *(int32_t *)(dst + width) = 0;
+      src += 32;
+      dst += stride;
+      row += 1;
+    } while (row < height);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
index b3a879b0f..5e0687cd3 100644
--- a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -14,43 +14,55 @@
 #include <smmintrin.h>  /* SSE4.1 */
 
 #include "aom/aom_integer.h"
-#include "aom_dsp/x86/mem_sse2.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
 
 void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
                                 const int height, uint8_t *const levels) {
   const int stride = width + TX_PAD_HOR;
-  memset(levels - TX_PAD_TOP * stride, 0,
-         sizeof(*levels) * TX_PAD_TOP * stride);
-  memset(levels + stride * height, 0,
-         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
-
   const __m128i zeros = _mm_setzero_si128();
+
+  const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+  uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+  uint8_t *pre_buf_end = pre_buf + pre_len;
+  do {
+    _mm_storeu_si128((__m128i *)(pre_buf), zeros);
+    pre_buf += 16;
+  } while (pre_buf < pre_buf_end);
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf = levels + stride * height;
+  uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+  do {
+    _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+    bottom_buf += 16;
+  } while (bottom_buf < bottom_buf_end);
+
   int i = 0;
   uint8_t *ls = levels;
   const tran_low_t *cf = coeff;
   if (width == 4) {
     do {
-      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
-      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
       const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
       const __m128i absAB = _mm_abs_epi16(coeffAB);
       const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
       const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
-      _mm_storeu_si128((__m128i *)ls, lsAB);
+      xx_storeu_128(ls, lsAB);
       ls += (stride << 1);
       cf += (width << 1);
       i += 2;
     } while (i < height);
   } else if (width == 8) {
     do {
-      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
-      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
       const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
       const __m128i absAB = _mm_abs_epi16(coeffAB);
       const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
-      _mm_storeu_si128((__m128i *)ls, absAB8);
+      xx_storeu_128(ls, absAB8);
       ls += stride;
       cf += width;
       i += 1;
@@ -59,16 +71,16 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
     do {
       int j = 0;
       do {
-        const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
-        const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
-        const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
-        const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+        const __m128i coeffA = xx_loadu_128(cf);
+        const __m128i coeffB = xx_loadu_128(cf + 4);
+        const __m128i coeffC = xx_loadu_128(cf + 8);
+        const __m128i coeffD = xx_loadu_128(cf + 12);
         const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
         const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
         const __m128i absAB = _mm_abs_epi16(coeffAB);
         const __m128i absCD = _mm_abs_epi16(coeffCD);
         const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
-        _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+        xx_storeu_128(ls + j, absABCD);
         j += 16;
         cf += 16;
       } while (j < width);
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 4cd6371a6..535485ae8 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -17,6 +17,7 @@
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
 #include "aom_ports/mem.h"
@@ -393,7 +394,32 @@ static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
   _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
 }
 
-static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+// Writes an 8-row block of 32-bit values, two vectors (8 lanes) per row.
+//
+// Row r is taken from res[2 * r] (lanes 0-3) and res[2 * r + 1] (lanes 4-7)
+// and stored, unaligned, at output + r * stride.
+//
+//   res    - 16 source vectors; only read.
+//   output - destination of row 0.
+//   stride - distance between destination rows, in int32_t elements.
+//
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+                                     const int stride) {
+  enum { kRows = 8 };
+  for (int row = 0; row < kRows; ++row) {
+    const __m128i *const src = res + 2 * row;
+    int32_t *const dst = output + row * stride;
+
+    // Lanes 0-3 of the row.
+    _mm_storeu_si128((__m128i *)dst, src[0]);
+    // Lanes 4-7 of the row.
+    _mm_storeu_si128((__m128i *)(dst + 4), src[1]);
+  }
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+                           const int col_num) {
+  (void)(col_num);
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
@@ -589,7 +615,9 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
   out[13] = u[3];  // buf0[3]
 }
 
-static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+                            const int col_num) {
+  (void)(col_num);
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
@@ -780,82 +808,82 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_DCT:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case DCT_ADST:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_ADST:
       load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_DCT:
       load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case DCT_FLIPADST:
       load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_FLIPADST:
       load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_ADST:
       load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
       col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
@@ -940,7 +968,26 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
   convert_8x8_to_16x16(in, out);
 }
 
-static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+// Loads an 8-wide, 16-row block as two 8x8 halves via load_buffer_8x8().
+//
+// The half starting at `input` goes to out[0..], the half starting 8 rows
+// further down goes to out[16..]; with `flipud` set the two halves swap
+// places. `flipud`, `fliplr` and `shift` are forwarded unchanged to each
+// 8x8 load.
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+                                    int stride, int flipud, int fliplr,
+                                    int shift) {
+  const int16_t *const upper = input;
+  const int16_t *const lower = input + 8 * stride;
+  const int16_t *const first = flipud ? lower : upper;
+  const int16_t *const second = flipud ? upper : lower;
+
+  load_buffer_8x8(first, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8(second, out + 16, stride, flipud, fliplr, shift);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+                             const int col_num) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
@@ -962,7 +1009,6 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   __m128i u[16], v[16], x;
-  const int col_num = 4;
   int col;
 
   // Calculate the column 0, 1, 2, 3
@@ -1226,7 +1272,8 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
   }
 }
 
-static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+                              const int num_cols) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
@@ -1271,25 +1318,25 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
   __m128i u[16], v[16], x, y;
   int col;
 
-  for (col = 0; col < 4; ++col) {
+  for (col = 0; col < num_cols; ++col) {
     // stage 0
     // stage 1
-    u[0] = in[0 * 4 + col];
-    u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
-    u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
-    u[3] = in[8 * 4 + col];
-    u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
-    u[5] = in[12 * 4 + col];
-    u[6] = in[4 * 4 + col];
-    u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
-    u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
-    u[9] = in[14 * 4 + col];
-    u[10] = in[6 * 4 + col];
-    u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
-    u[12] = in[2 * 4 + col];
-    u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
-    u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
-    u[15] = in[10 * 4 + col];
+    u[0] = in[0 * num_cols + col];
+    u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+    u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+    u[3] = in[8 * num_cols + col];
+    u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+    u[5] = in[12 * num_cols + col];
+    u[6] = in[4 * num_cols + col];
+    u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+    u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+    u[9] = in[14 * num_cols + col];
+    u[10] = in[6 * num_cols + col];
+    u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+    u[12] = in[2 * num_cols + col];
+    u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+    u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+    u[15] = in[10 * num_cols + col];
 
     // stage 2
     v[0] = u[0];
@@ -1453,22 +1500,22 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
     v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
 
     // stage 9
-    out[0 * 4 + col] = v[1];
-    out[1 * 4 + col] = v[14];
-    out[2 * 4 + col] = v[3];
-    out[3 * 4 + col] = v[12];
-    out[4 * 4 + col] = v[5];
-    out[5 * 4 + col] = v[10];
-    out[6 * 4 + col] = v[7];
-    out[7 * 4 + col] = v[8];
-    out[8 * 4 + col] = v[9];
-    out[9 * 4 + col] = v[6];
-    out[10 * 4 + col] = v[11];
-    out[11 * 4 + col] = v[4];
-    out[12 * 4 + col] = v[13];
-    out[13 * 4 + col] = v[2];
-    out[14 * 4 + col] = v[15];
-    out[15 * 4 + col] = v[0];
+    out[0 * num_cols + col] = v[1];
+    out[1 * num_cols + col] = v[14];
+    out[2 * num_cols + col] = v[3];
+    out[3 * num_cols + col] = v[12];
+    out[4 * num_cols + col] = v[5];
+    out[5 * num_cols + col] = v[10];
+    out[6 * num_cols + col] = v[7];
+    out[7 * num_cols + col] = v[8];
+    out[8 * num_cols + col] = v[9];
+    out[9 * num_cols + col] = v[6];
+    out[10 * num_cols + col] = v[11];
+    out[11 * num_cols + col] = v[4];
+    out[12 * num_cols + col] = v[13];
+    out[13 * num_cols + col] = v[2];
+    out[14 * num_cols + col] = v[15];
+    out[15 * num_cols + col] = v[0];
   }
 }
 
@@ -1482,6 +1529,11 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
   col_txfm_8x8_rounding(&in[48], shift);
 }
 
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+  // Run col_txfm_8x8_rounding() on both halves, which start 16 vectors apart.
+  for (int half = 0; half < 2; ++half) col_txfm_8x8_rounding(in + 16 * half, shift);
+}
+
 static void write_buffer_16x16(const __m128i *in, int32_t *output) {
   const int size_8x8 = 16 * 4;
   write_buffer_8x8(&in[0], output);
@@ -1499,85 +1551,86 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
   const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
   const int txw_idx = get_txw_idx(TX_16X16);
   const int txh_idx = get_txh_idx(TX_16X16);
+  const int col_num = 4;
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_DCT:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case DCT_ADST:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_ADST:
       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_DCT:
       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case DCT_FLIPADST:
       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_FLIPADST:
       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_ADST:
       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
       col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
@@ -1585,3 +1638,146 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
   }
   (void)bd;
 }
+
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+  for (int i = 0; i < size; i += 2) in[size - 2 - i] = out[i];  // copies out[] INTO in[] with the register pairs in reverse order; even slots (was a literal 30, i.e. only right for size == 32)
+  for (int i = 1; i < size; i += 2) in[size - i] = out[i];  // odd slots: order inside each pair is kept, so pair k lands at pair size/2 - 1 - k
+}
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_sse4_1,   // DCT_DCT
+  fadst8x8_sse4_1,  // ADST_DCT
+  fdct8x8_sse4_1,   // DCT_ADST
+  fadst8x8_sse4_1,  // ADST_ADST
+  fadst8x8_sse4_1,  // FLIPADST_DCT
+  fdct8x8_sse4_1,   // DCT_FLIPADST
+  fadst8x8_sse4_1,  // FLIPADST_FLIPADST
+  fadst8x8_sse4_1,  // ADST_FLIPADST
+  fadst8x8_sse4_1,  // FLIPADST_ADST
+  NULL,             // IDTX
+  NULL,             // V_DCT
+  NULL,             // H_DCT
+  NULL,             // V_ADST
+  NULL,             // H_ADST
+  NULL,             // V_FLIPADST
+  NULL              // H_FLIPADST
+};  // Indexed by tx_type; read as the column kernel by av1_fwd_txfm2d_16x8_sse4_1, which calls it without a NULL check, so IDTX..H_FLIPADST must not reach that function.
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+  fdct16x16_sse4_1,   // DCT_DCT
+  fdct16x16_sse4_1,   // ADST_DCT
+  fadst16x16_sse4_1,  // DCT_ADST
+  fadst16x16_sse4_1,  // ADST_ADST
+  fdct16x16_sse4_1,   // FLIPADST_DCT
+  fadst16x16_sse4_1,  // DCT_FLIPADST
+  fadst16x16_sse4_1,  // FLIPADST_FLIPADST
+  fadst16x16_sse4_1,  // ADST_FLIPADST
+  fadst16x16_sse4_1,  // FLIPADST_ADST
+  NULL,               // IDTX
+  NULL,               // V_DCT
+  NULL,               // H_DCT
+  NULL,               // V_ADST
+  NULL,               // H_ADST
+  NULL,               // V_FLIPADST
+  NULL                // H_FLIPADST
+};  // Indexed by tx_type; read as the row kernel by av1_fwd_txfm2d_16x8_sse4_1, which calls it without a NULL check, so IDTX..H_FLIPADST must not reach that function.
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+  fdct16x16_sse4_1,   // DCT_DCT
+  fadst16x16_sse4_1,  // ADST_DCT
+  fdct16x16_sse4_1,   // DCT_ADST
+  fadst16x16_sse4_1,  // ADST_ADST
+  fadst16x16_sse4_1,  // FLIPADST_DCT
+  fdct16x16_sse4_1,   // DCT_FLIPADST
+  fadst16x16_sse4_1,  // FLIPADST_FLIPADST
+  fadst16x16_sse4_1,  // ADST_FLIPADST
+  fadst16x16_sse4_1,  // FLIPADST_ADST
+  NULL,               // IDTX
+  NULL,               // V_DCT
+  NULL,               // H_DCT
+  NULL,               // V_ADST
+  NULL,               // H_ADST
+  NULL,               // V_FLIPADST
+  NULL                // H_FLIPADST
+};  // Indexed by tx_type; read as the column kernel by av1_fwd_txfm2d_8x16_sse4_1, which calls it without a NULL check, so IDTX..H_FLIPADST must not reach that function.
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_sse4_1,   // DCT_DCT
+  fdct8x8_sse4_1,   // ADST_DCT
+  fadst8x8_sse4_1,  // DCT_ADST
+  fadst8x8_sse4_1,  // ADST_ADST
+  fdct8x8_sse4_1,   // FLIPADST_DCT
+  fadst8x8_sse4_1,  // DCT_FLIPADST
+  fadst8x8_sse4_1,  // FLIPADST_FLIPADST
+  fadst8x8_sse4_1,  // ADST_FLIPADST
+  fadst8x8_sse4_1,  // FLIPADST_ADST
+  NULL,             // IDTX
+  NULL,             // V_DCT
+  NULL,             // H_DCT
+  NULL,             // V_ADST
+  NULL,             // H_ADST
+  NULL,             // V_FLIPADST
+  NULL              // H_FLIPADST
+};  // Indexed by tx_type; read as the row kernel by av1_fwd_txfm2d_8x16_sse4_1, which calls it without a NULL check, so IDTX..H_FLIPADST must not reach that function.
+
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {  // column pass, rounding, row pass, rectangular scaling; result is written to coeff
+  __m128i in[32], out[32];  // two 32-register scratch buffers
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];  // shift[0]: applied at load, shift[1]: after the column pass, shift[2]: after the row pass
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];  // NULL for IDTX and later tx types; not checked here
+  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  int bit = fwd_cos_bit_col[txw_idx][txh_idx];  // the same value is handed to both the column and the row kernel
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < 2; i++) {  // column pass: one 8-column half of the input per iteration
+    load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);  // 0 is passed where the 8x16 variant passes lr_flip; the flip is done below instead
+    col_txfm(in, in, bit, 0);
+    col_txfm_8x8_rounding(in, -shift[1]);
+    transpose_8x8(in, out + i * 16);  // each half lands in its own 16-register slice of out[]
+  }
+
+  if (lr_flip) {
+    flip_buf_sse4_1(in, out, 32);  // writes the pair-reversed copy of out[] into in[]
+    row_txfm(in, out, bit, 2);
+  } else {
+    row_txfm(out, out, bit, 2);  // in place
+  }
+
+  for (int i = 0; i < 2; i++) {  // transpose each slice back, apply the NewSqrt2 round/shift, store
+    transpose_8x8(out + i * 16, in);
+    av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_16x8(in, coeff + i * 8, 16);
+  }
+
+  (void)bd;  // bd is not used by this implementation
+}
+
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {  // column pass, rounding, row pass, rectangular scaling; result is written to coeff
+  __m128i in[32], out[32];  // two 32-register scratch buffers
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];  // shift[0]: applied at load, shift[1]: after the column pass, shift[2]: after the row pass
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];  // NULL for IDTX and later tx types; not checked here
+  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+  int bit = fwd_cos_bit_col[txw_idx][txh_idx];  // the same value is handed to both the column and the row kernel
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);  // both flips are handled by the loader here
+  col_txfm(in, in, bit, 2);
+  col_txfm_8x16_rounding(in, -shift[1]);
+  transpose_8x8(in, out);  // first 16 registers
+  transpose_8x8(in + 16, out + 16);  // second 16 registers
+
+  for (int i = 0; i < 2; i++) {  // row pass: one 16-register slice per iteration
+    row_txfm(out + i * 16, out, bit, 0);  // result always goes to the start of out[]; slice 1 is still intact when i == 1
+    transpose_8x8(out, in);
+    av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_8x8(in, coeff + i * 64);  // each slice fills 64 consecutive coefficients
+  }
+
+  (void)bd;  // bd is not used by this implementation
+}
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 000000000..06aaaa7ee
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
+                                 const __m128i *shuffle, const __m256i *kl) {  // dst[0..7] += pairwise products of 16 shuffled src bytes with *kl
+  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);  // unaligned 16-byte load, bytes reordered by *shuffle
+  const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));  // zero-extend to 16 bits, multiply by *kl, add adjacent products -> 8 x int32
+  const __m256i dst0 = yy_loadu_256(dst);
+  const __m256i r0 = _mm256_add_epi32(dst0, d0);  // 32-bit accumulation, no widening here
+  yy_storeu_256(dst, r0);
+}
+
+static INLINE void acc_stat_win7_one_line_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  int j, k, l;  // Adds one row's contribution to *sumX, sumY, M_int and H_int (all 32-bit accumulators).
+  const int wiener_win = WIENER_WIN;
+  for (j = h_start; j < h_end; j += 2) {  // two pixels per step; NOTE(review): src[j + 1] is read and counted even when h_end - h_start is odd
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = src[j + 1];
+    *sumX += X1 + X2;
+    const uint8_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = dgd_ijk[l + 1];
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+        // Broadcast the (D1, D2) byte pair; built from the bytes already loaded instead of a type-punned 16-bit load (same value on x86).
+        const __m256i kl =
+            _mm256_cvtepu8_epi16(_mm_set1_epi16((short)(D1 | (D2 << 8))));
+        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);  // one call per window row, 8 accumulators each
+        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+}
+
+static INLINE void compute_stats_win7_opt_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l, m, n;  // Fills M (wiener_win2 entries) and H (wiener_win2 x wiener_win2 entries) from integer sums over the given rectangle.
+  const int wiener_win = WIENER_WIN;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };  // per-batch sums
+  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };  // running totals
+  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };  // 8 slots per window row; only the first wiener_win are read back below
+  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;  // shifted up and left by half a window
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (j = v_start; j < v_end; j += 64) {  // rows are handled in batches of at most 64
+    const int vert_end = AOMMIN(64, v_end - j) + j;
+    for (i = j; i < vert_end; i++) {
+      acc_stat_win7_one_line_avx2(
+          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    for (k = 0; k < wiener_win; ++k) {  // flush the batch's 32-bit M sums into the 64-bit totals (presumably to avoid int32 overflow)
+      for (l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (k = 0; k < WIENER_WIN2; ++k) {  // same flush for H; sumX and sumY stay 32-bit throughout
+      for (l = 0; l < WIENER_WIN * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {  // convert the raw sums to mean-removed statistics
+    for (l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);  // sum(D*X) - avg*(sum(X) + sum(D)) + N*avg^2
+      double *H_ = H + idx0 * wiener_win2;
+      int64_t *H_int_ = &H_int64[idx0][0];
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -  // stride 8 matches the accumulator layout
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+static INLINE void acc_stat_win5_one_line_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  int j, k, l;  // Adds one row's contribution to *sumX, sumY, M_int and H_int (all 32-bit accumulators).
+  const int wiener_win = WIENER_WIN_CHROMA;
+  for (j = h_start; j < h_end; j += 2) {  // two pixels per step; NOTE(review): src[j + 1] is read and counted even when h_end - h_start is odd
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = src[j + 1];
+    *sumX += X1 + X2;
+    const uint8_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = dgd_ijk[l + 1];
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+        // Broadcast the (D1, D2) byte pair; built from the bytes already loaded instead of a type-punned 16-bit load (same value on x86).
+        const __m256i kl =
+            _mm256_cvtepu8_epi16(_mm_set1_epi16((short)(D1 | (D2 << 8))));
+        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);  // one call per window row, 8 accumulators each
+        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+}
+
+static INLINE void compute_stats_win5_opt_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l, m, n;  // Fills M (wiener_win2 entries) and H (wiener_win2 x wiener_win2 entries) from integer sums over the given rectangle.
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };  // per-batch sums
+  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };  // running totals
+  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };  // 8 slots per window row; only the first wiener_win are read back below
+  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;  // shifted up and left by half a window
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (j = v_start; j < v_end; j += 64) {  // rows are handled in batches of at most 64
+    const int vert_end = AOMMIN(64, v_end - j) + j;
+    for (i = j; i < vert_end; i++) {
+      acc_stat_win5_one_line_avx2(
+          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    for (k = 0; k < wiener_win; ++k) {  // flush the batch's 32-bit M sums into the 64-bit totals (presumably to avoid int32 overflow)
+      for (l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {  // same flush for H; sumX and sumY stay 32-bit throughout
+      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {  // convert the raw sums to mean-removed statistics
+    for (l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);  // sum(D*X) - avg*(sum(X) + sum(D)) + N*avg^2
+      double *H_ = H + idx0 * wiener_win2;
+      int64_t *H_int_ = &H_int64[idx0][0];
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -  // stride 8 matches the accumulator layout
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+                            const uint8_t *src, int h_start, int h_end,
+                            int v_start, int v_end, int dgd_stride,
+                            int src_stride, double *M, double *H) {  // dispatches on the window size; M and H are the outputs
+  if (wiener_win == WIENER_WIN) {  // full-size window: AVX2 path
+    compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+                                dgd_stride, src_stride, M, H);
+  } else if (wiener_win == WIENER_WIN_CHROMA) {  // chroma-size window: AVX2 path
+    compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+                                dgd_stride, src_stride, M, H);
+  } else {  // any other size falls back to the C implementation
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M, H);
+  }
+}
+
+static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {  // arguments are truncated to 16 bits by the parameter types
+  return _mm256_set1_epi32(  // every 32-bit lane holds a in its low half and b in its high half
+      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;  // Returns the sum over all pixels of e * e, where e = projected dat8 value - src8 value.
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+  __m256i sum64 = _mm256_setzero_si256();  // four 64-bit partial sums from the vector loops
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  int64_t err = 0;  // scalar-tail sum; the vector sums are folded in just before returning
+  if (params->r[0] > 0 && params->r[1] > 0) {  // both flt0 and flt1 contribute
+    __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+    for (i = 0; i < height; ++i) {
+      __m256i sum32 = _mm256_setzero_si256();
+      for (j = 0; j <= width - 16; j += 16) {  // 16 pixels per iteration
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i flt0_16b = _mm256_permute4x64_epi64(
+            _mm256_packs_epi32(yy_loadu_256(flt0 + j),  // NOTE(review): packs saturates to int16; assumes flt values fit - TODO confirm
+                               yy_loadu_256(flt0 + j + 8)),
+            0xd8);  // restores element order after the per-128-bit-lane pack
+        const __m256i flt1_16b = _mm256_permute4x64_epi64(
+            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+                               yy_loadu_256(flt1 + j + 8)),
+            0xd8);
+        const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+        const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+        const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+        const __m256i v0 = _mm256_madd_epi16(  // xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+            xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m256i v1 = _mm256_madd_epi16(
+            xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m256i vr0 =
+            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+        const __m256i vr1 =
+            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+        const __m256i e0 = _mm256_sub_epi16(
+            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+        const __m256i err0 = _mm256_madd_epi16(e0, e0);
+        sum32 = _mm256_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {  // scalar tail for the pixels the 16-wide loop did not cover
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      const __m256i sum64_0 =  // widen this row's 32-bit sums to 64 bits before accumulating
+          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+      const __m256i sum64_1 =
+          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+      sum64 = _mm256_add_epi64(sum64, sum64_0);
+      sum64 = _mm256_add_epi64(sum64, sum64_1);
+    }
+  } else if (params->r[0] > 0) {  // only flt0 contributes
+    __m256i xq_coeff =  // paired with (flt0, d0) below, madd yields xq[0] * flt0 - xq[0] * (d0 << SGRPROJ_RST_BITS)
+        pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
+    for (i = 0; i < height; ++i) {
+      __m256i sum32 = _mm256_setzero_si256();
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i flt0_16b = _mm256_permute4x64_epi64(
+            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+                               yy_loadu_256(flt0 + j + 8)),
+            0xd8);
+        const __m256i v0 =
+            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
+        const __m256i v1 =
+            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
+        const __m256i vr0 =
+            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+        const __m256i vr1 =
+            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+        const __m256i e0 = _mm256_sub_epi16(
+            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+        const __m256i err0 = _mm256_madd_epi16(e0, e0);
+        sum32 = _mm256_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {  // scalar tail
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[0] * (flt0[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      const __m256i sum64_0 =
+          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+      const __m256i sum64_1 =
+          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+      sum64 = _mm256_add_epi64(sum64, sum64_0);
+      sum64 = _mm256_add_epi64(sum64, sum64_1);
+    }
+  } else if (params->r[1] > 0) {  // only flt1 contributes; multiply, not <<, because xq[1] is a signed int
+    __m256i xq_coeff = pair_set_epi16(xq[1], -xq[1] * (1 << SGRPROJ_RST_BITS));
+    for (i = 0; i < height; ++i) {
+      __m256i sum32 = _mm256_setzero_si256();
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i flt1_16b = _mm256_permute4x64_epi64(
+            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+                               yy_loadu_256(flt1 + j + 8)),
+            0xd8);
+        const __m256i v0 =
+            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
+        const __m256i v1 =
+            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
+        const __m256i vr0 =
+            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+        const __m256i vr1 =
+            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+        const __m256i e0 = _mm256_sub_epi16(
+            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+        const __m256i err0 = _mm256_madd_epi16(e0, e0);
+        sum32 = _mm256_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {  // scalar tail
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt1 += flt1_stride;
+      const __m256i sum64_0 =
+          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+      const __m256i sum64_1 =
+          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+      sum64 = _mm256_add_epi64(sum64, sum64_0);
+      sum64 = _mm256_add_epi64(sum64, sum64_1);
+    }
+  } else {  // neither filter contributes: plain squared difference dat - src
+    __m256i sum32 = _mm256_setzero_si256();  // NOTE(review): kept in 32 bits across all rows and widened only once after the loop
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+        const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+        sum32 = _mm256_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {  // scalar tail
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+    const __m256i sum64_0 =
+        _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+    const __m256i sum64_1 =
+        _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+    sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+  }
+  int64_t sum[4];
+  yy_storeu_256(sum, sum64);  // horizontal add of the four 64-bit lanes
+  err += sum[0] + sum[1] + sum[2] + sum[3];
+  return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 000000000..04e4d1afc
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+                                  const __m128i *shuffle, const __m128i *kl) {  // dst[0..7] += pairwise products of 16 shuffled src bytes with *kl
+  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);  // unaligned 16-byte load, bytes reordered by *shuffle
+  const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));  // low 8 bytes: zero-extend, multiply by *kl, add adjacent products
+  const __m128i d1 =
+      _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));  // same for the high 8 bytes
+  const __m128i dst0 = xx_loadu_128(dst);
+  const __m128i dst1 = xx_loadu_128(dst + 4);
+  const __m128i r0 = _mm_add_epi32(dst0, d0);  // 32-bit accumulation, no widening here
+  const __m128i r1 = _mm_add_epi32(dst1, d1);
+  xx_storeu_128(dst, r0);
+  xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = WIENER_WIN;  // was a literal 7; tied to the array bounds above
+  int j, k, l;  // Adds one row's contribution to *sumX, sumY, M_int and H_int (all 32-bit accumulators).
+  for (j = h_start; j < h_end; j += 2) {  // two pixels per step; NOTE(review): src[j + 1] is read and counted even when h_end - h_start is odd
+    const uint8_t *dgd_ij = dgd + j;
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = src[j + 1];
+    *sumX += X1 + X2;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = dgd_ijk[l + 1];
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+        // Broadcast the (D1, D2) byte pair; built from the bytes already loaded instead of a type-punned 16-bit load (same value on x86).
+        const __m128i kl =
+            _mm_cvtepu8_epi16(_mm_set1_epi16((short)(D1 | (D2 << 8))));
+        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);  // one call per window row, 8 accumulators each
+        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+}
+
+static INLINE void compute_stats_win7_opt_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l, m, n;  // Fills M (wiener_win2 entries) and H (wiener_win2 x wiener_win2 entries) from integer sums over the given rectangle.
+  const int wiener_win = WIENER_WIN;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };  // per-batch sums
+  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };  // running totals
+  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };  // 8 slots per window row; only the first wiener_win are read back below
+  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;  // shifted up and left by half a window
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (j = v_start; j < v_end; j += 64) {  // rows are handled in batches of at most 64
+    const int vert_end = AOMMIN(64, v_end - j) + j;
+    for (i = j; i < vert_end; i++) {
+      acc_stat_win7_one_line_sse4_1(
+          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    for (k = 0; k < wiener_win; ++k) {  // flush the batch's 32-bit M sums into the 64-bit totals (presumably to avoid int32 overflow)
+      for (l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (k = 0; k < WIENER_WIN2; ++k) {  // same flush for H; sumX and sumY stay 32-bit throughout
+      for (l = 0; l < WIENER_WIN * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {  // convert the raw sums to mean-removed statistics
+    for (l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);  // sum(D*X) - avg*(sum(X) + sum(D)) + N*avg^2
+      double *H_ = H + idx0 * wiener_win2;
+      int64_t *H_int_ = &H_int64[idx0][0];
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -  // stride 8 matches the accumulator layout
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  int j, k, l;  // Adds one row's contribution to *sumX, sumY, M_int and H_int (all 32-bit accumulators).
+  for (j = h_start; j < h_end; j += 2) {  // two pixels per step; NOTE(review): src[j + 1] is read and counted even when h_end - h_start is odd
+    const uint8_t *dgd_ij = dgd + j;
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = src[j + 1];
+    *sumX += X1 + X2;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = dgd_ijk[l + 1];
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+        // Broadcast the (D1, D2) byte pair; built from the bytes already loaded instead of a type-punned 16-bit load (same value on x86).
+        const __m128i kl =
+            _mm_cvtepu8_epi16(_mm_set1_epi16((short)(D1 | (D2 << 8))));
+        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);  // one call per window row, 8 accumulators each
+        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+}
+
+static INLINE void compute_stats_win5_opt_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l, m, n;  // Fills M (wiener_win2 entries) and H (wiener_win2 x wiener_win2 entries) from integer sums over the given rectangle.
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };  // per-batch sums
+  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };  // running totals
+  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };  // 8 slots per window row; only the first wiener_win are read back below
+  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;  // shifted up and left by half a window
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (j = v_start; j < v_end; j += 64) {  // rows are handled in batches of at most 64
+    const int vert_end = AOMMIN(64, v_end - j) + j;
+    for (i = j; i < vert_end; i++) {
+      acc_stat_win5_one_line_sse4_1(
+          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    for (k = 0; k < wiener_win; ++k) {  // flush the batch's 32-bit M sums into the 64-bit totals (presumably to avoid int32 overflow)
+      for (l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {  // same flush for H; bound is the declared first dimension of H_int32 / H_int64
+      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {  // convert the raw sums to mean-removed statistics
+    for (l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);  // sum(D*X) - avg*(sum(X) + sum(D)) + N*avg^2
+      double *H_ = H + idx0 * wiener_win2;
+      int64_t *H_int_ = &H_int64[idx0][0];
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -  // stride 8 matches the accumulator layout
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+                              const uint8_t *src, int h_start, int h_end,
+                              int v_start, int v_end, int dgd_stride,
+                              int src_stride, double *M, double *H) {  // dispatches on the window size; M and H are the outputs
+  if (wiener_win == WIENER_WIN) {  // full-size window: SSE4.1 path
+    compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                  dgd_stride, src_stride, M, H);
+  } else if (wiener_win == WIENER_WIN_CHROMA) {  // chroma-size window: SSE4.1 path
+    compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                  dgd_stride, src_stride, M, H);
+  } else {  // any other size falls back to the C implementation
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M, H);
+  }
+}
+
+static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {  // a low, b high
+  return _mm_set1_epi32((int32_t)((uint32_t)a | ((uint32_t)b << 16)));
+}
+
+// Adds the four signed 32-bit lanes of v32 onto the two 64-bit lanes of acc64:
+// lanes 0 and 2 land in the low half, lanes 1 and 3 in the high half.
+static INLINE __m128i add_epi32_to_epi64(__m128i acc64, __m128i v32) {
+  const __m128i lo = _mm_cvtepi32_epi64(v32);
+  const __m128i hi = _mm_cvtepi32_epi64(_mm_srli_si128(v32, 8));
+  return _mm_add_epi64(acc64, _mm_add_epi64(lo, hi));
+}
+
+// Returns the sum of squared errors e over a width x height block of 8-bit
+// pixels.  With u = dat << SGRPROJ_RST_BITS, each pixel contributes
+//   v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+//   e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + dat - src
+// where a term is dropped if the matching params->r[] entry is not positive
+// (with neither active, e = dat - src).
+//
+// src8 and dat8 are the two 8-bit pixel planes; flt0 and flt1 are the 32-bit
+// planes weighted by xq[0] and xq[1].  Each plane is walked row by row using
+// its own stride.  Rows are handled 8 pixels at a time (16 when no filter is
+// active) with SSE4.1; the vector loops use a strict `<` bound, so the final
+// 1 to 8 (or 1 to 16) pixels of every row always go through the scalar tail.
+// The vector paths narrow flt0/flt1 to int16 with signed saturation.
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+  __m128i sum64 = _mm_setzero_si128();
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    // Both filters contribute: madd the interleaved (flt0 - u, flt1 - u)
+    // pairs with the (xq[0], xq[1]) pair.
+    const __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt0_16b =
+            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+        const __m128i flt1_16b =
+            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+        const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+        const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+        const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+        const __m128i v0 = _mm_madd_epi16(
+            xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m128i v1 = _mm_madd_epi16(
+            xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        // Round, shift, narrow to 16 bits, then e = v + dat - src.
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      // Scalar tail for the pixels the vector loop left over.
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        const int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += (int64_t)e * e;  // widen first so the square cannot overflow
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      sum64 = add_epi32_to_epi64(sum64, sum32);
+    }
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    // Exactly one filter contributes; select its weight, plane and stride.
+    const int use_flt0 = params->r[0] > 0;
+    const int xq_active = use_flt0 ? xq[0] : xq[1];
+    const int32_t *flt = use_flt0 ? flt0 : flt1;
+    const int flt_stride = use_flt0 ? flt0_stride : flt1_stride;
+    // Pair (xq, -xq * 2^SGRPROJ_RST_BITS) so that madd with (flt, dat) gives
+    // xq * (flt - u).  Multiply rather than shift: left-shifting a negative
+    // xq is undefined behavior.
+    const __m128i xq_coeff =
+        pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt_16b =
+            _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
+        const __m128i v0 =
+            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
+        const __m128i v1 =
+            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
+        // Round, shift, narrow to 16 bits, then e = v + dat - src.
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      // Scalar tail for the pixels the vector loop left over.
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        const int32_t v = xq_active * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += (int64_t)e * e;  // widen first so the square cannot overflow
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+      sum64 = add_epi32_to_epi64(sum64, sum32);
+    }
+  } else {
+    // No filter contributes: plain sum of squared differences dat - src,
+    // 16 pixels per vector iteration.
+    for (i = 0; i < height; ++i) {
+      // Fresh accumulator per row, widened below like the filtered paths, so
+      // the 32-bit lanes only ever hold one row's worth of error.
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 16; j += 16) {
+        const __m128i d = xx_loadu_128(dat + j);
+        const __m128i s = xx_loadu_128(src + j);
+        const __m128i d0 = _mm_cvtepu8_epi16(d);
+        const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+        const __m128i s0 = _mm_cvtepu8_epi16(s);
+        const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+        const __m128i diff0 = _mm_sub_epi16(d0, s0);
+        const __m128i diff1 = _mm_sub_epi16(d1, s1);
+        const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+        const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+        sum32 = _mm_add_epi32(sum32, err0);
+        sum32 = _mm_add_epi32(sum32, err1);
+      }
+      // Scalar tail for the pixels the vector loop left over.
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      sum64 = add_epi32_to_epi64(sum64, sum32);
+    }
+  }
+  // Fold the two 64-bit vector lanes into the scalar-tail total.
+  int64_t sum[2];
+  xx_storeu_128(sum, sum64);
+  err += sum[0] + sum[1];
+  return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
index f776e84c7..2a792f14e 100644
--- a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -14,7 +14,7 @@
 #include <smmintrin.h>
 
 #include "aom_dsp/x86/synonyms.h"
-
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom/aom_integer.h"
 
 #include "av1/common/reconinter.h"
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
   uint64_t csse;
 
   const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
-  const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff);
+  const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
 
   __m256i v_acc0_q = _mm256_setzero_si256();
 
diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com
new file mode 100644
index 000000000..5c8e0e09d
--- /dev/null
+++ b/third_party/aom/av1/exports_com
@@ -0,0 +1,2 @@
+text aom_read_obu_header_and_size
+text av1_resize_frame420
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
index 05860e8c0..daabf6766 100644
--- a/third_party/aom/av1/exports_dec
+++ b/third_party/aom/av1/exports_dec
@@ -1,2 +1,3 @@
 data aom_codec_av1_dx_algo
 text aom_codec_av1_dx
+text av1_add_film_grain
diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test
new file mode 100644
index 000000000..dab377575
--- /dev/null
+++ b/third_party/aom/av1/exports_test
@@ -0,0 +1,2 @@
+text av1_get_fwd_txfm_cfg
+text av1_rtcd
-- 
cgit v1.2.3